From b016f4e191f58c078a79a486ddee9efaef1cfd14 Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 16 Jan 2023 19:09:26 +0800 Subject: [PATCH 01/21] start matx.inductor --- python/matx/script/context/inductor_context.py | 18 ++++++++++++++++++ python/matx/toolchain.py | 4 ++++ 2 files changed, 22 insertions(+) create mode 100644 python/matx/script/context/inductor_context.py diff --git a/python/matx/script/context/inductor_context.py b/python/matx/script/context/inductor_context.py new file mode 100644 index 00000000..84bf20b4 --- /dev/null +++ b/python/matx/script/context/inductor_context.py @@ -0,0 +1,18 @@ +# Copyright 2022 ByteDance Ltd. and/or its affiliates. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. \ No newline at end of file diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py index 8758f557..556ba4fe 100644 --- a/python/matx/toolchain.py +++ b/python/matx/toolchain.py @@ -380,6 +380,10 @@ def script(compiling_obj, *, share=True, toolchain=None, bundle_args=None): raise ValueError('Unsupported build_type: {}'.format(result.build_type)) +def inductor(compiling_obj, *, share=True, toolchain=None, bundle_args=None): + pass + + def make_session(compiling_obj, method='__call__'): from . import pipeline From bb78f4514a90b0d20d90b6e9df6f7f66145abfec Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Tue, 17 Jan 2023 21:35:30 +0800 Subject: [PATCH 02/21] almost finish inductor mvp --- python/matx/__init__.py | 5 ++- python/matx/contrib/cc.py | 11 ++++- python/matx/inductor/__init__.py | 45 +++++++++++++++++++ python/matx/inductor/context/__init__.py | 0 python/matx/runtime/module.py | 2 + python/matx/script/context/__init__.py | 1 + python/matx/script/context/ast_node.py | 3 +- .../matx/script/context/inductor_context.py | 12 ++++- python/matx/toolchain.py | 43 ++++++++++++++++-- 9 files changed, 114 insertions(+), 8 deletions(-) create mode 100644 python/matx/inductor/__init__.py create mode 100644 python/matx/inductor/context/__init__.py diff --git a/python/matx/__init__.py b/python/matx/__init__.py index 4870a959..5c84b691 100644 --- a/python/matx/__init__.py +++ b/python/matx/__init__.py @@ -30,7 +30,6 @@ from . import vision from . import tools - # APIs __all__ = [ # functions @@ -352,6 +351,10 @@ def script(compiling_obj, *args, backend=None, **kwargs): return toolchain.script(compiling_obj, *args, **kwargs) +def inductor(compiling_obj, example_inputs, **kwargs): + return toolchain.inductor(compiling_obj, example_inputs, **kwargs) + + def script_embedded_class(code, is_path=False): return toolchain.script_embedded_class(code, is_path) diff --git a/python/matx/contrib/cc.py b/python/matx/contrib/cc.py index bd45d695..4bed4eeb 100644 --- a/python/matx/contrib/cc.py +++ b/python/matx/contrib/cc.py @@ -93,9 +93,16 @@ def find_sys_cc_path(): raise RuntimeError("win32 is not supported") elif sys.platform.startswith('darwin'): # maybe we can use clang++ - cc_bin = "g++" + # prioritized compiler defined in CXX + if 'CXX' in os.environ: + cc_bin = os.environ['CXX'] + else: + cc_bin = "g++" else: - cc_bin = "g++" + if 'CXX' in os.environ: + cc_bin = os.environ['CXX'] + else: + cc_bin = "g++" return cc_bin diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py new file mode 100644 index 00000000..751e5d52 --- /dev/null +++ b/python/matx/inductor/__init__.py @@ -0,0 +1,45 @@ +import inspect +from typing import List + +import torch +from torch_compiler.manual_codegen import extract_inductor_code, matx_cpp_code_format + +from matx.env import MATX_DEV_MODE +from matx.script import context +from matx.toolchain import path_prefix + + +def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> context.ScriptContext: + try: + + code = extract_inductor_code(compiling_obj, example_inputs) + code = matx_cpp_code_format(code) + + sc_ctx = context.ScriptContext() + sc_ctx.build_type = context.BuildType.FUNCTION + sc_ctx.main_node.raw = compiling_obj + # set sc_ctx attributes to be compatible with existing matx code + inductor_context = context.InductorContext(fn_name=compiling_obj.__name__) + sc_ctx.main_node.context = inductor_context + # set source code TODO: formatting source code + sc_ctx.main_node.span.source_code = inspect.getsource(compiling_obj) + # set filename. TODO: this is too hack + frame = inspect.stack()[3] + sc_ctx.main_node.span.file_name = frame[0].f_code.co_filename + + # export code + path = path_prefix(sc_ctx) + with open(path, 'w') as f: + f.write(code) + + # set rt_module + from .. import _ffi + build_module = _ffi.get_global_func("embedded.build.c") + sc_ctx.rt_module = build_module(code.encode()) + + return sc_ctx + except BaseException as e: + if MATX_DEV_MODE: + raise + else: + raise Exception(str(e)) from None diff --git a/python/matx/inductor/context/__init__.py b/python/matx/inductor/context/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/matx/runtime/module.py b/python/matx/runtime/module.py index e57caa28..11aa736b 100644 --- a/python/matx/runtime/module.py +++ b/python/matx/runtime/module.py @@ -210,6 +210,8 @@ def export_library(self, file_name, fcompile=None, addons=None, **kwargs): assert self.type_key == "c" + breakpoint() + modules = self._collect_dso_modules() files = addons if addons else [] is_system_lib = False diff --git a/python/matx/script/context/__init__.py b/python/matx/script/context/__init__.py index 342af971..896630ad 100644 --- a/python/matx/script/context/__init__.py +++ b/python/matx/script/context/__init__.py @@ -23,3 +23,4 @@ from .class_context import ClassContext, GetClassAttr from .function_context import FunctionContext, FunctionType from .scope_context import ScopeContext +from .inductor_context import InductorContext diff --git a/python/matx/script/context/ast_node.py b/python/matx/script/context/ast_node.py index 70ee5c57..55b2c418 100644 --- a/python/matx/script/context/ast_node.py +++ b/python/matx/script/context/ast_node.py @@ -22,6 +22,7 @@ from matx._typed_ast import ast from .class_context import ClassContext from .function_context import FunctionContext +from .inductor_context import InductorContext from ... import ir as _ir @@ -49,7 +50,7 @@ def __init__(self, ): self.raw: Optional[type] = None self.span: Span = Span() self.ast: Optional[ast.AST] = None - self.context: Union[ClassContext, FunctionContext, None] = None + self.context: Union[ClassContext, FunctionContext, InductorContext, None] = None self.module: Optional[ModuleInfo] = None self.deps: Optional[List[ASTNode]] = None self.ir_schema = None diff --git a/python/matx/script/context/inductor_context.py b/python/matx/script/context/inductor_context.py index 84bf20b4..dbdb434e 100644 --- a/python/matx/script/context/inductor_context.py +++ b/python/matx/script/context/inductor_context.py @@ -15,4 +15,14 @@ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations -# under the License. \ No newline at end of file +# under the License. + + +class InductorContext(object): + def __init__(self, + fn_name: str = '', ): + self.fn_name = fn_name + + @property + def name(self): + return self.fn_name diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py index 556ba4fe..bd946627 100644 --- a/python/matx/toolchain.py +++ b/python/matx/toolchain.py @@ -40,6 +40,7 @@ USE_SO_CACHE = os.environ.get('MATX_USE_SO_CACHE', '').lower() != 'false' DISABLE_SCRIPT = os.environ.get('MATX_DISABLE_SCRIPT', '').lower() == 'true' +DISABLE_INDUCTOR = os.environ.get('MATX_DISABLE_INDUCTOR', '').lower() == 'true' DISABLE_GENERATE_CC = os.environ.get('MATX_DISABLE_GENERATE_CC', '').lower() == 'true' FLAG_COMPILED_OBJECT = object() @@ -296,7 +297,7 @@ def toolchain_build(sc_ctx: context.ScriptContext, toolchain: ToolChain): sc_ctx.dso_path = (sc_ctx.dso_path[0], so_path) -def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False): +def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False, compile_options=None): rt_mod = sc_ctx.rt_module main_node_name = sc_ctx.main_node.context.name base_path = path_prefix(sc_ctx) @@ -305,12 +306,16 @@ def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False): sopath = base_path + '.so' sopath_cxx11 = base_path + '_cxx11.so' + # TODO: need to unify the compile options base_options = [ "-std=c++14", "-O3", "-g", "-fdiagnostics-color=always", "-Werror=return-type"] + if compile_options is not None: + assert isinstance(compile_options, List) + base_options.extend(compile_options) cxx11_with_abi_options = base_options + ["-D_GLIBCXX_USE_CXX11_ABI=1"] cxx11_no_abi_options = base_options + ["-D_GLIBCXX_USE_CXX11_ABI=0"] sys_cc_path = contrib.cc.find_sys_cc_path() @@ -380,8 +385,40 @@ def script(compiling_obj, *, share=True, toolchain=None, bundle_args=None): raise ValueError('Unsupported build_type: {}'.format(result.build_type)) -def inductor(compiling_obj, *, share=True, toolchain=None, bundle_args=None): - pass +def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundle_args=None): + if DISABLE_SCRIPT: + return compiling_obj + + from matx.inductor import from_source + + result: context.ScriptContext = from_source(compiling_obj, example_inputs) + + # TODO: get Pytorch additional compiler flags. Hardcode here for mvp + torch_compiler_options = [ + '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include', + '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/torch/csrc/api/include', + '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/TH', + '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/THC', + '-I/Users/bytedance/miniforge3/envs/inductor/include/python3.10', + '-lgomp', + '-march=native', + '-ffast-math', + '-fno-finite-math-only', + '-fopenmp', + '-DC10_USING_CUSTOM_GENERATED_MACROS' + ] + + + build_dso(result, toolchain is not None, compile_options=torch_compiler_options) + if toolchain is not None: + toolchain_build(result, toolchain) + + if result.build_type is context.BuildType.FUNCTION: + return make_jit_op_creator(result, share, bundle_args=bundle_args)() + elif result.build_type is context.BuildType.JIT_OBJECT: + return make_jit_object_creator(result, share, bundle_args=bundle_args) + else: + raise ValueError('Unsupported build_type: {}'.format(result.build_type)) def make_session(compiling_obj, method='__call__'): From 9bf0c3198d3a51668f1c55b3975e92d944887824 Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Wed, 18 Jan 2023 11:59:35 +0800 Subject: [PATCH 03/21] matx.inductor mvp example works --- python/matx/inductor/__init__.py | 7 +++++++ python/matx/runtime/module.py | 2 -- python/matx/script/context/inductor_context.py | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py index 751e5d52..ac50930a 100644 --- a/python/matx/inductor/__init__.py +++ b/python/matx/inductor/__init__.py @@ -37,6 +37,13 @@ def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> cont build_module = _ffi.get_global_func("embedded.build.c") sc_ctx.rt_module = build_module(code.encode()) + # set args types. # TODO: hardcode for now + from .. import ir + sc_ctx.main_node.context.arg_types = dict( + a=ir.type.NDArrayType(), + b=ir.type.NDArrayType() + ) + return sc_ctx except BaseException as e: if MATX_DEV_MODE: diff --git a/python/matx/runtime/module.py b/python/matx/runtime/module.py index 11aa736b..e57caa28 100644 --- a/python/matx/runtime/module.py +++ b/python/matx/runtime/module.py @@ -210,8 +210,6 @@ def export_library(self, file_name, fcompile=None, addons=None, **kwargs): assert self.type_key == "c" - breakpoint() - modules = self._collect_dso_modules() files = addons if addons else [] is_system_lib = False diff --git a/python/matx/script/context/inductor_context.py b/python/matx/script/context/inductor_context.py index dbdb434e..ec775a1f 100644 --- a/python/matx/script/context/inductor_context.py +++ b/python/matx/script/context/inductor_context.py @@ -22,6 +22,9 @@ class InductorContext(object): def __init__(self, fn_name: str = '', ): self.fn_name = fn_name + self.unbound_name = fn_name + self.return_type = None + self.arg_types = {} # Deferred? @property def name(self): From 3439aea6c2975b1c67f9bf06b9b3bdfd5317b0af Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Wed, 18 Jan 2023 12:08:31 +0800 Subject: [PATCH 04/21] add inductor demo --- examples/inductor/simple_inductor.py | 66 ++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 examples/inductor/simple_inductor.py diff --git a/examples/inductor/simple_inductor.py b/examples/inductor/simple_inductor.py new file mode 100644 index 00000000..a4ef44ff --- /dev/null +++ b/examples/inductor/simple_inductor.py @@ -0,0 +1,66 @@ +import matx +import torch +import numpy as np +import json + + +def kernel(a: matx.NDArray, b: matx.NDArray): + c = a + b + c = torch.nn.functional.relu(c) + return c, + + +add_kernel = matx.inductor(kernel, example_inputs=[ + torch.randn(5), + torch.randn(5) +]) + + +@matx.script +def add_json(a: str, b: str) -> str: + """ + Assume a and b is a json containing 10 digits. We would like to add them and return another json + """ + a_list = json.loads(a) + b_list = json.loads(b) + + a_tensor = matx.NDArray(arr=a_list, shape=[5], dtype='float32') + b_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32') + c_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32') + + add_kernel(a_tensor, b_tensor, c_tensor) + + result_lst = c_tensor.tolist() + + return json.dumps(result_lst) + + +if __name__ == '__main__': + print(f'Pytorch version {torch.__version__}') + + a_np = np.random.randn(5).astype(np.float32) + b_np = np.random.randn(5).astype(np.float32) + c_np = np.random.randn(5).astype(np.float32) + + a = matx.NDArray([], a_np.shape, str(a_np.dtype)) + a.from_numpy(a_np) + + b = matx.NDArray([], b_np.shape, str(b_np.dtype)) + b.from_numpy(b_np) + + c = matx.NDArray([], c_np.shape, str(c_np.dtype)) + c.from_numpy(c_np) + + print(a) + print(b) + print(c) + + print(kernel(a.torch(), b.torch())) + + d = add_kernel(a, b, c) + print(c) + + a = json.dumps([1, 2, 3, 4, 5]) + b = json.dumps([6, 7, 8, 9, 10]) + result = add_json(a, b) + print(result) From 0dc8280348d258e08cca0acabf307d72c0684c0c Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Tue, 31 Jan 2023 15:22:44 +0800 Subject: [PATCH 05/21] move torch_compiler inside matx. Currently, depends on Pytorch 2.0 --- python/matx/__init__.py | 16 +- python/matx/inductor/__init__.py | 2 +- python/matx/torch_compiler/__init__.py | 13 + python/matx/torch_compiler/codegen.py | 261 ++++++++++++++++++ .../torch_compiler/tests}/simple_inductor.py | 36 +-- python/matx/torch_compiler/utils/__init__.py | 0 python/matx/torch_compiler/utils/cpp_parse.py | 141 ++++++++++ 7 files changed, 434 insertions(+), 35 deletions(-) create mode 100644 python/matx/torch_compiler/__init__.py create mode 100644 python/matx/torch_compiler/codegen.py rename {examples/inductor => python/matx/torch_compiler/tests}/simple_inductor.py (56%) create mode 100644 python/matx/torch_compiler/utils/__init__.py create mode 100644 python/matx/torch_compiler/utils/cpp_parse.py diff --git a/python/matx/__init__.py b/python/matx/__init__.py index 5c84b691..78cb0d35 100644 --- a/python/matx/__init__.py +++ b/python/matx/__init__.py @@ -351,8 +351,20 @@ def script(compiling_obj, *args, backend=None, **kwargs): return toolchain.script(compiling_obj, *args, **kwargs) -def inductor(compiling_obj, example_inputs, **kwargs): - return toolchain.inductor(compiling_obj, example_inputs, **kwargs) +def inductor(example_inputs, **kwargs): + """ + + Args: + example_inputs: any nested structure of torch.Tensor that passed into the kernel + **kwargs: other keyword arguments passed into toolchain.inductor + + Returns: a wrapper that compiles the compiling_obj into a JIT FUNCTION + + """ + def inner_inductor(compiling_obj): + return toolchain.inductor(compiling_obj, example_inputs, **kwargs) + + return inner_inductor def script_embedded_class(code, is_path=False): diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py index ac50930a..b3a61d84 100644 --- a/python/matx/inductor/__init__.py +++ b/python/matx/inductor/__init__.py @@ -2,7 +2,7 @@ from typing import List import torch -from torch_compiler.manual_codegen import extract_inductor_code, matx_cpp_code_format +from matx.torch_compiler.codegen import extract_inductor_code, matx_cpp_code_format from matx.env import MATX_DEV_MODE from matx.script import context diff --git a/python/matx/torch_compiler/__init__.py b/python/matx/torch_compiler/__init__.py new file mode 100644 index 00000000..e83ac962 --- /dev/null +++ b/python/matx/torch_compiler/__init__.py @@ -0,0 +1,13 @@ +minimum_torch_version = '2.0.0a0' + +try: + import torch + + assert torch.__version__ >= minimum_torch_version + +except ModuleNotFoundError: + print(f'torch is not installed. matx.inductor requires torch >= {minimum_torch_version}') + raise +except AssertionError: + print(f'matx.inductor requires torch >= {minimum_torch_version}') + raise diff --git a/python/matx/torch_compiler/codegen.py b/python/matx/torch_compiler/codegen.py new file mode 100644 index 00000000..4d2566c9 --- /dev/null +++ b/python/matx/torch_compiler/codegen.py @@ -0,0 +1,261 @@ +import copy +import logging +from typing import List + +import torch +import torch._inductor.compile_fx as compile_fx +from torch import fx +from torch._inductor.debug import DebugContext +from torch._inductor.virtualized import V + +from .utils import cpp_parse + +log = logging.getLogger(__name__) + +MATX_INCLUDE = ''' +#include "matxscript/runtime/codegen_all_includes.h" +#include + +using namespace ::matxscript::runtime; +extern "C" void* __matxscript_module_ctx = NULL; + +extern "C" MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__; +''' + +SESSION_HANLDER = cpp_parse.CPPArg(name='handle_2_71828182846', + type=cpp_parse.CPPType(name='void', is_pointer=True)) +SESSION_HANLDER_WITH_DEAFULT = cpp_parse.CPPArg(name='handle_2_71828182846', + type=cpp_parse.CPPType(name='void', is_pointer=True), + default_val='((void*)(int64_t)0)') + + +def generate_ndarray_arg_cast(arg_name, arg_index, dtype, message='TODO'): + return f'({dtype}*)internal::TypeAsHelper::run(({arg_name}[{arg_index}]), __FILE__, __LINE__, "{message}", "{message}").Data<{dtype}>()' + + +def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) -> str: + template_with_return = ''' +int kernel__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr) +{{ + TArgs args_t(args, num_args); + + if (num_args > 0 && args[num_args - 1].code == TypeIndex::kRuntimeKwargs) {{ + string_view arg_names[{}] {{{}}}; + KwargsUnpackHelper helper("{}", arg_names, {}, nullptr, 0); + RTView pos_args[{}]; + helper.unpack(pos_args, args, num_args); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 + + auto ret = {}({}, + {}resource_handle); + RTValue(std::move(ret)).MoveToCHost(out_ret_value); + }} else {{ + switch(num_args) {{ + case {}: {{ + auto ret = {}({}, + {}resource_handle); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 + RTValue(std::move(ret)).MoveToCHost(out_ret_value); + }} break; + default: {{THROW_PY_TypeError("TODO");}} break; // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 + }} + }} + + return 0; +}} +''' + template_without_return = ''' +int kernel__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr) +{{ + TArgs args_t(args, num_args); + + if (num_args > 0 && args[num_args - 1].code == TypeIndex::kRuntimeKwargs) {{ + string_view arg_names[{}] {{{}}}; + KwargsUnpackHelper helper("{}", arg_names, {}, nullptr, 0); + RTView pos_args[{}]; + helper.unpack(pos_args, args, num_args); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 + + {}({}, + {}resource_handle); + }} else {{ + switch(num_args) {{ + case {}: {{ + {}({}, + {}resource_handle); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 + int ret = 1; + RTValue(std::move(ret)).MoveToCHost(out_ret_value); + }} break; + default: {{THROW_PY_TypeError("TODO");}} break; // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 + }} + }} + + return 0; +}} +''' + if has_return_value: + template = template_with_return + else: + template = template_without_return + + num_args = len(args) + arg_names_concat_str = ', '.join([f'"{arg.name}"' for arg in args]) + args_dtype = [arg.type.name for arg in args] + + pos_arg_cast_lst = [] + args_t_cast_lst = [] + for arg_index in range(num_args): + pos_arg_cast_lst.append(generate_ndarray_arg_cast('pos_args', arg_index, args_dtype[arg_index])) + args_t_cast_lst.append(generate_ndarray_arg_cast('args_t', arg_index, args_dtype[arg_index])) + + kernel_name_indentation = len(kernel_name) * ' ' + if has_return_value: + return_name_indentation = ' ' * 11 + else: + return_name_indentation = '' + pos_arg_cast_indentation = '\n ' + kernel_name_indentation + return_name_indentation + args_t_cast_indentation = '\n ' + kernel_name_indentation + return_name_indentation + pos_arg_cast = (',' + pos_arg_cast_indentation).join(pos_arg_cast_lst) + args_t_cast = (',' + args_t_cast_indentation).join(args_t_cast_lst) + + return template.format(num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name, + pos_arg_cast, kernel_name_indentation, num_args, kernel_name, + args_t_cast, kernel_name_indentation) + + +def get_registration_str(kernel_name): + # TODO: currently, only 1 function is here. + template = ''' +extern "C" {{ + +MATX_DLL MATXScriptBackendPackedCFunc __matxscript_func_array__[] = {{ + (MATXScriptBackendPackedCFunc){}__c_api, +}}; +MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__ = {{ + "1\\000{}\\000", __matxscript_func_array__, +}}; + +}} // extern C + +extern "C" {{ + +MATX_DLL const char* __matxscript_closures_names__ = "1\\000{}\\000"; + +}} // extern C + + ''' + return template.format(kernel_name, kernel_name, kernel_name) + + +def get_c_api_declare(kernel_name): + return f'int {kernel_name}__c_api(MATXScriptAny*, int, MATXScriptAny*, void*);' + + +def extract_cpp_code(code: str): + return code.split("'''")[1][1:-1] + + +def matx_cpp_code_format(code: str) -> str: + code = extract_cpp_code(code) + # split include and kernel code + first_newline_idx = code.find('\n') + include_code_str = code[:first_newline_idx] + kernel_code_str = code[first_newline_idx + 1:] + + # add matx include + include_code_str += MATX_INCLUDE + + # extract kernel declaration + first_open_bracket = kernel_code_str.find('{') + kernel_declaration_str = kernel_code_str[:first_open_bracket] + kernel_body_str = kernel_code_str[first_open_bracket:] + + kernel_declaration = cpp_parse.parse_cpp_declaration(kernel_declaration_str) + + kernel_declaration_without_default = copy.deepcopy(kernel_declaration) + kernel_declaration_without_default.append_arg(SESSION_HANLDER) + kernel_declaration_with_default = copy.deepcopy(kernel_declaration) + kernel_declaration_with_default.append_arg(SESSION_HANLDER_WITH_DEAFULT) + + # add kernel declaration and c-api + function_declaration_str = str(kernel_declaration_with_default) + ';' + '\n\n' + \ + get_c_api_declare(kernel_declaration_with_default.func_name) + '\n' + + # add kernel + kernel_impl_str = str(kernel_declaration_without_default) + '\n' + kernel_body_str + + # add kernel c-api + + kernel_c_api_impl_str = get_c_api(kernel_name=kernel_declaration.func_name, + args=kernel_declaration.args, + has_return_value=kernel_declaration.return_type.name != 'void') + + # add namespace + kernel_code_str = ['namespace {', function_declaration_str, kernel_impl_str, + kernel_c_api_impl_str, '} // namespace'] + kernel_code_str = '\n\n'.join(kernel_code_str) + + # registration str + registration_code_str = get_registration_str(kernel_name=kernel_declaration.func_name) + + # final code + final_code = [include_code_str, kernel_code_str, registration_code_str] + + final_code = '\n\n'.join(final_code) + + return final_code + + +""" +Use a global variable to hack the compile_fx_inner and record the compiled code. +This works in single process problem, but requires careful review in multi-processing +""" + + +class FakeCallableWithCode(): + code = None + + def __call__(self, *args, **kwargs): + raise NotImplementedError + + def set_code(self, code): + self.code = code + + +fake_callable = FakeCallableWithCode() + + +@DebugContext.wrap +@torch.utils._python_dispatch._disable_current_modes() +def compile_fx_inner_cpu( + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + cudagraphs=None, + num_fixed=0, + is_backward=False, + graph_id=None, +): + # lift the maximum depth of the Python interpreter stack + # to adapt large/deep models + compile_fx.sys.setrecursionlimit(max(compile_fx.sys.getrecursionlimit(), 2000)) + V.debug.fx_graph(gm, example_inputs) + shape_env = compile_fx._shape_env_from_inputs(example_inputs) + fake_mode = compile_fx.fake_mode_from_tensors(example_inputs) + graph = compile_fx.GraphLowering( + gm, + shape_env=shape_env, + num_static_inputs=num_fixed, + graph_id=graph_id, + fake_mode=fake_mode, + ) + with V.set_graph_handler(graph): + graph.run(*example_inputs) + code = graph.codegen() + fake_callable.set_code(code) + + return fake_callable + + +def extract_inductor_code(kernel, example_inputs): + model = fx.symbolic_trace(kernel) + compile_fx.compile_fx(model, example_inputs_=example_inputs, inner_compile=compile_fx_inner_cpu) + + code = fake_callable.code + return code diff --git a/examples/inductor/simple_inductor.py b/python/matx/torch_compiler/tests/simple_inductor.py similarity index 56% rename from examples/inductor/simple_inductor.py rename to python/matx/torch_compiler/tests/simple_inductor.py index a4ef44ff..cee50ce5 100644 --- a/examples/inductor/simple_inductor.py +++ b/python/matx/torch_compiler/tests/simple_inductor.py @@ -1,21 +1,16 @@ +import json + import matx import torch -import numpy as np -import json +@matx.inductor(example_inputs=[torch.randn(5), torch.randn(5)]) def kernel(a: matx.NDArray, b: matx.NDArray): c = a + b c = torch.nn.functional.relu(c) return c, -add_kernel = matx.inductor(kernel, example_inputs=[ - torch.randn(5), - torch.randn(5) -]) - - @matx.script def add_json(a: str, b: str) -> str: """ @@ -28,7 +23,7 @@ def add_json(a: str, b: str) -> str: b_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32') c_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32') - add_kernel(a_tensor, b_tensor, c_tensor) + kernel(a_tensor, b_tensor, c_tensor) result_lst = c_tensor.tolist() @@ -37,29 +32,6 @@ def add_json(a: str, b: str) -> str: if __name__ == '__main__': print(f'Pytorch version {torch.__version__}') - - a_np = np.random.randn(5).astype(np.float32) - b_np = np.random.randn(5).astype(np.float32) - c_np = np.random.randn(5).astype(np.float32) - - a = matx.NDArray([], a_np.shape, str(a_np.dtype)) - a.from_numpy(a_np) - - b = matx.NDArray([], b_np.shape, str(b_np.dtype)) - b.from_numpy(b_np) - - c = matx.NDArray([], c_np.shape, str(c_np.dtype)) - c.from_numpy(c_np) - - print(a) - print(b) - print(c) - - print(kernel(a.torch(), b.torch())) - - d = add_kernel(a, b, c) - print(c) - a = json.dumps([1, 2, 3, 4, 5]) b = json.dumps([6, 7, 8, 9, 10]) result = add_json(a, b) diff --git a/python/matx/torch_compiler/utils/__init__.py b/python/matx/torch_compiler/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/matx/torch_compiler/utils/cpp_parse.py b/python/matx/torch_compiler/utils/cpp_parse.py new file mode 100644 index 00000000..e0d9611d --- /dev/null +++ b/python/matx/torch_compiler/utils/cpp_parse.py @@ -0,0 +1,141 @@ +import dataclasses +from typing import List, Union + + +@dataclasses.dataclass +class CPPType(object): + name: str = None + is_pointer: bool = False + + def __str__(self): + result = self.name + if self.is_pointer: + result += '*' + + return result + + +@dataclasses.dataclass +class CPPArg(object): + name: str = None + type: CPPType = CPPType() + is_const: bool = False + is_restricted: bool = False + default_val: Union[str, None] = None + + def __str__(self): + result = [] + if self.is_const: + result.append('const') + result.append(str(self.type)) + if self.is_restricted: + result.append('__restrict__') + result.append(self.name) + + if self.default_val is not None: + result.append(f'= {self.default_val}') + + return ' '.join(result) + + +def parse_cpp_arg(cpp_arg_str: str) -> CPPArg: + """Parse the C++ arg from a string such as const float* __restrict__ a = null_ptr + + :param cpp_arg_str: the string of the argument + :return: a CPPArg dataclass + """ + + cpp_arg = CPPArg() + + # find if there is a default value + if '=' in cpp_arg_str: + cpp_arg_str, default_val = cpp_arg_str.split('=') + default_val = default_val.replace(' ', '') + cpp_arg.default_val = default_val + + word = cpp_arg_str.split() + + cpp_arg.name = word[-1] + + for w in word[:-1]: + if w == 'const': + cpp_arg.is_const = True + elif w == '*': + cpp_arg.type.is_pointer = True + elif w == '__restrict__': + cpp_arg.is_restricted = True + else: + # type + if w[-1] == '*': + cpp_arg.type.is_pointer = True + w = w[:-1] # remove * + cpp_arg.type.name = w + + return cpp_arg + + +@dataclasses.dataclass +class CPPDeclaration(object): + func_name: str = None + return_type: CPPType = CPPType() + args: List[CPPArg] = dataclasses.field(default_factory=list) + is_extern_c: bool = False + + def append_arg(self, arg: CPPArg): + self.args.append(arg) + + def __str__(self): + result = [] + if self.is_extern_c: + result.append('extern "C"') + result.append(str(self.return_type)) + result.append(self.func_name) + + front = ' '.join(result) + num_spaces = len(front) + 1 + interval = ',\n' + ' ' * num_spaces + + args_str = interval.join([str(arg) for arg in self.args]) + + return front + '(' + args_str + ')' + + +def parse_cpp_declaration(cpp_declaration_str: str) -> CPPDeclaration: + """Parse the CPP declaration in string and return a CPPDeclaration. + + :param cpp_declaration_str: + :return: + """ + cpp_declaration = CPPDeclaration() + + identifier_return_name, cpp_arg_str = cpp_declaration_str.split('(') + cpp_arg_str = cpp_arg_str.split(')')[0] + cpp_arg_str_lst = cpp_arg_str.split(',') + # arguments + for cpp_arg_str in cpp_arg_str_lst: + cpp_declaration.args.append(parse_cpp_arg(cpp_arg_str)) + + # process return type and function name + identifier_return_name_lst = identifier_return_name.split() + if identifier_return_name_lst[0] == 'extern' and identifier_return_name_lst[1] == '"C"': + cpp_declaration.is_extern_c = True + identifier_return_name_lst = identifier_return_name_lst[2:] + + cpp_declaration.func_name = identifier_return_name_lst[-1] + # remove func_name + return_type_str_lst = identifier_return_name_lst[:-1] + + if len(return_type_str_lst) == 1: + return_type_str = return_type_str_lst[0] + if return_type_str[-1] == '*': + cpp_declaration.return_type.name = return_type_str[:-1] + cpp_declaration.return_type.is_pointer = True + else: + cpp_declaration.return_type.name = return_type_str + else: + assert len(return_type_str_lst) == 2 + assert return_type_str_lst[-1] == '*' + cpp_declaration.return_type.name = return_type_str_lst[0] + cpp_declaration.return_type.is_pointer = True + + return cpp_declaration From 3b7d7373ce26ddc9da884ff59b4768ef84e020f7 Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Tue, 31 Jan 2023 15:41:22 +0800 Subject: [PATCH 06/21] [matx.inductor] fix kernel name --- python/matx/inductor/__init__.py | 4 ++-- python/matx/torch_compiler/codegen.py | 18 +++++++++++++----- .../torch_compiler/tests/simple_inductor.py | 4 ++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py index b3a61d84..88e87920 100644 --- a/python/matx/inductor/__init__.py +++ b/python/matx/inductor/__init__.py @@ -12,8 +12,8 @@ def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> context.ScriptContext: try: - code = extract_inductor_code(compiling_obj, example_inputs) - code = matx_cpp_code_format(code) + code, kernel_name = extract_inductor_code(compiling_obj, example_inputs) + code = matx_cpp_code_format(code, kernel_name) sc_ctx = context.ScriptContext() sc_ctx.build_type = context.BuildType.FUNCTION diff --git a/python/matx/torch_compiler/codegen.py b/python/matx/torch_compiler/codegen.py index 4d2566c9..2d84345c 100644 --- a/python/matx/torch_compiler/codegen.py +++ b/python/matx/torch_compiler/codegen.py @@ -35,7 +35,7 @@ def generate_ndarray_arg_cast(arg_name, arg_index, dtype, message='TODO'): def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) -> str: template_with_return = ''' -int kernel__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr) +int {}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr) {{ TArgs args_t(args, num_args); @@ -63,7 +63,7 @@ def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) }} ''' template_without_return = ''' -int kernel__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr) +int {}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr) {{ TArgs args_t(args, num_args); @@ -115,7 +115,7 @@ def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) pos_arg_cast = (',' + pos_arg_cast_indentation).join(pos_arg_cast_lst) args_t_cast = (',' + args_t_cast_indentation).join(args_t_cast_lst) - return template.format(num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name, + return template.format(kernel_name, num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name, pos_arg_cast, kernel_name_indentation, num_args, kernel_name, args_t_cast, kernel_name_indentation) @@ -152,7 +152,7 @@ def extract_cpp_code(code: str): return code.split("'''")[1][1:-1] -def matx_cpp_code_format(code: str) -> str: +def matx_cpp_code_format(code: str, kernel_name: str) -> str: code = extract_cpp_code(code) # split include and kernel code first_newline_idx = code.find('\n') @@ -168,6 +168,8 @@ def matx_cpp_code_format(code: str) -> str: kernel_body_str = kernel_code_str[first_open_bracket:] kernel_declaration = cpp_parse.parse_cpp_declaration(kernel_declaration_str) + # TODO: remove this hack after port to C++ codegen + kernel_declaration.func_name = kernel_name kernel_declaration_without_default = copy.deepcopy(kernel_declaration) kernel_declaration_without_default.append_arg(SESSION_HANLDER) @@ -258,4 +260,10 @@ def extract_inductor_code(kernel, example_inputs): compile_fx.compile_fx(model, example_inputs_=example_inputs, inner_compile=compile_fx_inner_cpu) code = fake_callable.code - return code + + # By default, Pytorch compiles a Python module with all the C++ kernel with unified name kernel. + # The actual kernel name should be kernel.__name__. + # TODO: fix this after rewriting inductor codegen to all C++ instead of a Python module + kernel_name = kernel.__name__ + + return code, kernel_name diff --git a/python/matx/torch_compiler/tests/simple_inductor.py b/python/matx/torch_compiler/tests/simple_inductor.py index cee50ce5..a48ed2cf 100644 --- a/python/matx/torch_compiler/tests/simple_inductor.py +++ b/python/matx/torch_compiler/tests/simple_inductor.py @@ -5,7 +5,7 @@ @matx.inductor(example_inputs=[torch.randn(5), torch.randn(5)]) -def kernel(a: matx.NDArray, b: matx.NDArray): +def add_relu(a: matx.NDArray, b: matx.NDArray): c = a + b c = torch.nn.functional.relu(c) return c, @@ -23,7 +23,7 @@ def add_json(a: str, b: str) -> str: b_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32') c_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32') - kernel(a_tensor, b_tensor, c_tensor) + add_relu(a_tensor, b_tensor, c_tensor) result_lst = c_tensor.tolist() From 79de762ddf9085bfe1780d72e6e5e76e4f61c5de Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Wed, 1 Feb 2023 15:29:11 +0800 Subject: [PATCH 07/21] update matx formatter to match the raw function signature --- cpp_playground/main.cpp | 3 + python/matx/torch_compiler/codegen.py | 269 --------------- .../{utils => codegen}/__init__.py | 0 .../codegen/inductor/__init__.py | 0 .../torch_compiler/codegen/matx_formatter.py | 321 ++++++++++++++++++ .../torch_compiler/codegen/utils/__init__.py | 0 .../{ => codegen}/utils/cpp_parse.py | 0 .../torch_compiler/tests/nested_inputs.py | 0 test/inductor/test_basic.py | 0 9 files changed, 324 insertions(+), 269 deletions(-) create mode 100644 cpp_playground/main.cpp delete mode 100644 python/matx/torch_compiler/codegen.py rename python/matx/torch_compiler/{utils => codegen}/__init__.py (100%) create mode 100644 python/matx/torch_compiler/codegen/inductor/__init__.py create mode 100644 python/matx/torch_compiler/codegen/matx_formatter.py create mode 100644 python/matx/torch_compiler/codegen/utils/__init__.py rename python/matx/torch_compiler/{ => codegen}/utils/cpp_parse.py (100%) create mode 100644 python/matx/torch_compiler/tests/nested_inputs.py create mode 100644 test/inductor/test_basic.py diff --git a/cpp_playground/main.cpp b/cpp_playground/main.cpp new file mode 100644 index 00000000..c302b43a --- /dev/null +++ b/cpp_playground/main.cpp @@ -0,0 +1,3 @@ +// +// Created by ByteDance on 2023/2/1. +// diff --git a/python/matx/torch_compiler/codegen.py b/python/matx/torch_compiler/codegen.py deleted file mode 100644 index 2d84345c..00000000 --- a/python/matx/torch_compiler/codegen.py +++ /dev/null @@ -1,269 +0,0 @@ -import copy -import logging -from typing import List - -import torch -import torch._inductor.compile_fx as compile_fx -from torch import fx -from torch._inductor.debug import DebugContext -from torch._inductor.virtualized import V - -from .utils import cpp_parse - -log = logging.getLogger(__name__) - -MATX_INCLUDE = ''' -#include "matxscript/runtime/codegen_all_includes.h" -#include - -using namespace ::matxscript::runtime; -extern "C" void* __matxscript_module_ctx = NULL; - -extern "C" MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__; -''' - -SESSION_HANLDER = cpp_parse.CPPArg(name='handle_2_71828182846', - type=cpp_parse.CPPType(name='void', is_pointer=True)) -SESSION_HANLDER_WITH_DEAFULT = cpp_parse.CPPArg(name='handle_2_71828182846', - type=cpp_parse.CPPType(name='void', is_pointer=True), - default_val='((void*)(int64_t)0)') - - -def generate_ndarray_arg_cast(arg_name, arg_index, dtype, message='TODO'): - return f'({dtype}*)internal::TypeAsHelper::run(({arg_name}[{arg_index}]), __FILE__, __LINE__, "{message}", "{message}").Data<{dtype}>()' - - -def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) -> str: - template_with_return = ''' -int {}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr) -{{ - TArgs args_t(args, num_args); - - if (num_args > 0 && args[num_args - 1].code == TypeIndex::kRuntimeKwargs) {{ - string_view arg_names[{}] {{{}}}; - KwargsUnpackHelper helper("{}", arg_names, {}, nullptr, 0); - RTView pos_args[{}]; - helper.unpack(pos_args, args, num_args); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 - - auto ret = {}({}, - {}resource_handle); - RTValue(std::move(ret)).MoveToCHost(out_ret_value); - }} else {{ - switch(num_args) {{ - case {}: {{ - auto ret = {}({}, - {}resource_handle); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 - RTValue(std::move(ret)).MoveToCHost(out_ret_value); - }} break; - default: {{THROW_PY_TypeError("TODO");}} break; // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 - }} - }} - - return 0; -}} -''' - template_without_return = ''' -int {}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr) -{{ - TArgs args_t(args, num_args); - - if (num_args > 0 && args[num_args - 1].code == TypeIndex::kRuntimeKwargs) {{ - string_view arg_names[{}] {{{}}}; - KwargsUnpackHelper helper("{}", arg_names, {}, nullptr, 0); - RTView pos_args[{}]; - helper.unpack(pos_args, args, num_args); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 - - {}({}, - {}resource_handle); - }} else {{ - switch(num_args) {{ - case {}: {{ - {}({}, - {}resource_handle); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 - int ret = 1; - RTValue(std::move(ret)).MoveToCHost(out_ret_value); - }} break; - default: {{THROW_PY_TypeError("TODO");}} break; // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 - }} - }} - - return 0; -}} -''' - if has_return_value: - template = template_with_return - else: - template = template_without_return - - num_args = len(args) - arg_names_concat_str = ', '.join([f'"{arg.name}"' for arg in args]) - args_dtype = [arg.type.name for arg in args] - - pos_arg_cast_lst = [] - args_t_cast_lst = [] - for arg_index in range(num_args): - pos_arg_cast_lst.append(generate_ndarray_arg_cast('pos_args', arg_index, args_dtype[arg_index])) - args_t_cast_lst.append(generate_ndarray_arg_cast('args_t', arg_index, args_dtype[arg_index])) - - kernel_name_indentation = len(kernel_name) * ' ' - if has_return_value: - return_name_indentation = ' ' * 11 - else: - return_name_indentation = '' - pos_arg_cast_indentation = '\n ' + kernel_name_indentation + return_name_indentation - args_t_cast_indentation = '\n ' + kernel_name_indentation + return_name_indentation - pos_arg_cast = (',' + pos_arg_cast_indentation).join(pos_arg_cast_lst) - args_t_cast = (',' + args_t_cast_indentation).join(args_t_cast_lst) - - return template.format(kernel_name, num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name, - pos_arg_cast, kernel_name_indentation, num_args, kernel_name, - args_t_cast, kernel_name_indentation) - - -def get_registration_str(kernel_name): - # TODO: currently, only 1 function is here. - template = ''' -extern "C" {{ - -MATX_DLL MATXScriptBackendPackedCFunc __matxscript_func_array__[] = {{ - (MATXScriptBackendPackedCFunc){}__c_api, -}}; -MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__ = {{ - "1\\000{}\\000", __matxscript_func_array__, -}}; - -}} // extern C - -extern "C" {{ - -MATX_DLL const char* __matxscript_closures_names__ = "1\\000{}\\000"; - -}} // extern C - - ''' - return template.format(kernel_name, kernel_name, kernel_name) - - -def get_c_api_declare(kernel_name): - return f'int {kernel_name}__c_api(MATXScriptAny*, int, MATXScriptAny*, void*);' - - -def extract_cpp_code(code: str): - return code.split("'''")[1][1:-1] - - -def matx_cpp_code_format(code: str, kernel_name: str) -> str: - code = extract_cpp_code(code) - # split include and kernel code - first_newline_idx = code.find('\n') - include_code_str = code[:first_newline_idx] - kernel_code_str = code[first_newline_idx + 1:] - - # add matx include - include_code_str += MATX_INCLUDE - - # extract kernel declaration - first_open_bracket = kernel_code_str.find('{') - kernel_declaration_str = kernel_code_str[:first_open_bracket] - kernel_body_str = kernel_code_str[first_open_bracket:] - - kernel_declaration = cpp_parse.parse_cpp_declaration(kernel_declaration_str) - # TODO: remove this hack after port to C++ codegen - kernel_declaration.func_name = kernel_name - - kernel_declaration_without_default = copy.deepcopy(kernel_declaration) - kernel_declaration_without_default.append_arg(SESSION_HANLDER) - kernel_declaration_with_default = copy.deepcopy(kernel_declaration) - kernel_declaration_with_default.append_arg(SESSION_HANLDER_WITH_DEAFULT) - - # add kernel declaration and c-api - function_declaration_str = str(kernel_declaration_with_default) + ';' + '\n\n' + \ - get_c_api_declare(kernel_declaration_with_default.func_name) + '\n' - - # add kernel - kernel_impl_str = str(kernel_declaration_without_default) + '\n' + kernel_body_str - - # add kernel c-api - - kernel_c_api_impl_str = get_c_api(kernel_name=kernel_declaration.func_name, - args=kernel_declaration.args, - has_return_value=kernel_declaration.return_type.name != 'void') - - # add namespace - kernel_code_str = ['namespace {', function_declaration_str, kernel_impl_str, - kernel_c_api_impl_str, '} // namespace'] - kernel_code_str = '\n\n'.join(kernel_code_str) - - # registration str - registration_code_str = get_registration_str(kernel_name=kernel_declaration.func_name) - - # final code - final_code = [include_code_str, kernel_code_str, registration_code_str] - - final_code = '\n\n'.join(final_code) - - return final_code - - -""" -Use a global variable to hack the compile_fx_inner and record the compiled code. -This works in single process problem, but requires careful review in multi-processing -""" - - -class FakeCallableWithCode(): - code = None - - def __call__(self, *args, **kwargs): - raise NotImplementedError - - def set_code(self, code): - self.code = code - - -fake_callable = FakeCallableWithCode() - - -@DebugContext.wrap -@torch.utils._python_dispatch._disable_current_modes() -def compile_fx_inner_cpu( - gm: torch.fx.GraphModule, - example_inputs: List[torch.Tensor], - cudagraphs=None, - num_fixed=0, - is_backward=False, - graph_id=None, -): - # lift the maximum depth of the Python interpreter stack - # to adapt large/deep models - compile_fx.sys.setrecursionlimit(max(compile_fx.sys.getrecursionlimit(), 2000)) - V.debug.fx_graph(gm, example_inputs) - shape_env = compile_fx._shape_env_from_inputs(example_inputs) - fake_mode = compile_fx.fake_mode_from_tensors(example_inputs) - graph = compile_fx.GraphLowering( - gm, - shape_env=shape_env, - num_static_inputs=num_fixed, - graph_id=graph_id, - fake_mode=fake_mode, - ) - with V.set_graph_handler(graph): - graph.run(*example_inputs) - code = graph.codegen() - fake_callable.set_code(code) - - return fake_callable - - -def extract_inductor_code(kernel, example_inputs): - model = fx.symbolic_trace(kernel) - compile_fx.compile_fx(model, example_inputs_=example_inputs, inner_compile=compile_fx_inner_cpu) - - code = fake_callable.code - - # By default, Pytorch compiles a Python module with all the C++ kernel with unified name kernel. - # The actual kernel name should be kernel.__name__. - # TODO: fix this after rewriting inductor codegen to all C++ instead of a Python module - kernel_name = kernel.__name__ - - return code, kernel_name diff --git a/python/matx/torch_compiler/utils/__init__.py b/python/matx/torch_compiler/codegen/__init__.py similarity index 100% rename from python/matx/torch_compiler/utils/__init__.py rename to python/matx/torch_compiler/codegen/__init__.py diff --git a/python/matx/torch_compiler/codegen/inductor/__init__.py b/python/matx/torch_compiler/codegen/inductor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/matx/torch_compiler/codegen/matx_formatter.py b/python/matx/torch_compiler/codegen/matx_formatter.py new file mode 100644 index 00000000..36cce17f --- /dev/null +++ b/python/matx/torch_compiler/codegen/matx_formatter.py @@ -0,0 +1,321 @@ +# Copyright 2022 ByteDance Ltd. and/or its affiliates. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Utilities to format kernel code generated by inductor to a JITOp +""" + +import copy +import logging +from typing import List + +import torch + +from .utils import cpp_parse + +log = logging.getLogger(__name__) + +MAGIC_NUMBER = '2_71828182846' + +MATX_INCLUDE = ''' +#include "matxscript/runtime/codegen_all_includes.h" +#include + +using namespace ::matxscript::runtime; +extern "C" void* __matxscript_module_ctx = NULL; + +extern "C" MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__; + + + +''' + +SESSION_HANLDER = cpp_parse.CPPArg(name=f'handle_{MAGIC_NUMBER}', + type=cpp_parse.CPPType(name='void', is_pointer=True)) +SESSION_HANLDER_WITH_DEAFULT = cpp_parse.CPPArg(name=f'handle_{MAGIC_NUMBER}', + type=cpp_parse.CPPType(name='void', is_pointer=True), + default_val='((void*)(int64_t)0)') + +CREATE_NDARRAY_DECLARATION = ''' +// helper function to create NDArray +NDArray createNDArray(const std::string& dtype, + const std::string& device, + const std::vector& arg_shape); +''' + +CREATE_NDARRAY_IMPLEMENTATION = ''' +NDArray createNDArray(const std::string& dtype, + const std::string& device, + const std::vector& arg_shape) { + Unicode dtype_str(UTF8Decode(dtype)); + Unicode ctx_str(UTF8Decode(device)); + DataType data_type(String2DLDataType(UTF8Encode(dtype_str.view()))); + return NDArray::Empty(arg_shape, data_type, NDArrayHelper::GetDevice(ctx_str)); +} +''' + + +def generate_ndarray_arg_cast(arg_name, arg_index, message='TODO'): + return f'internal::TypeAsHelper::run(({arg_name}[{arg_index}]), __FILE__, __LINE__, "{message}", "{message}")' + + +def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) -> str: + template_with_return = ''' +int {}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr) +{{ + TArgs args_t(args, num_args); + + if (num_args > 0 && args[num_args - 1].code == TypeIndex::kRuntimeKwargs) {{ + string_view arg_names[{}] {{{}}}; + KwargsUnpackHelper helper("{}", arg_names, {}, nullptr, 0); + RTView pos_args[{}]; + helper.unpack(pos_args, args, num_args); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 + + auto ret = {}({}, + {}resource_handle); + RTValue(std::move(ret)).MoveToCHost(out_ret_value); + }} else {{ + switch(num_args) {{ + case {}: {{ + auto ret = {}({}, + {}resource_handle); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 + RTValue(std::move(ret)).MoveToCHost(out_ret_value); + }} break; + default: {{THROW_PY_TypeError("TODO");}} break; // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 + }} + }} + + return 0; +}} +''' + assert has_return_value + template = template_with_return + + num_args = len(args) + arg_names_concat_str = ', '.join([f'"{arg.name}"' for arg in args]) + args_dtype = [arg.type.name for arg in args] + + pos_arg_cast_lst = [] + args_t_cast_lst = [] + for arg_index in range(num_args): + pos_arg_cast_lst.append(generate_ndarray_arg_cast('pos_args', arg_index)) + args_t_cast_lst.append(generate_ndarray_arg_cast('args_t', arg_index)) + + kernel_name_indentation = len(kernel_name) * ' ' + if has_return_value: + return_name_indentation = ' ' * 11 + else: + return_name_indentation = '' + pos_arg_cast_indentation = '\n ' + kernel_name_indentation + return_name_indentation + args_t_cast_indentation = '\n ' + kernel_name_indentation + return_name_indentation + pos_arg_cast = (',' + pos_arg_cast_indentation).join(pos_arg_cast_lst) + args_t_cast = (',' + args_t_cast_indentation).join(args_t_cast_lst) + + return template.format(kernel_name, num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name, + pos_arg_cast, kernel_name_indentation, num_args, kernel_name, + args_t_cast, kernel_name_indentation) + + +def get_registration_str(kernel_name): + # TODO: currently, only 1 function is here. + template = ''' +extern "C" {{ + +MATX_DLL MATXScriptBackendPackedCFunc __matxscript_func_array__[] = {{ + (MATXScriptBackendPackedCFunc){}__c_api, +}}; +MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__ = {{ + "1\\000{}\\000", __matxscript_func_array__, +}}; + +}} // extern C + +extern "C" {{ + +MATX_DLL const char* __matxscript_closures_names__ = "1\\000{}\\000"; + +}} // extern C + + ''' + return template.format(kernel_name, kernel_name, kernel_name) + + +def get_c_api_declare(kernel_name): + return f'int {kernel_name}__c_api(MATXScriptAny*, int, MATXScriptAny*, void*);' + + +def extract_cpp_code(code: str): + return code.split("'''")[1][1:-1] + + +def split_include_kernel(code): + first_newline_idx = code.find('\n') + include_code_str = code[:first_newline_idx] + kernel_code_str = code[first_newline_idx + 1:] + return include_code_str, kernel_code_str + + +def split_declaration_body(kernel_code_str): + first_open_bracket = kernel_code_str.find('{') + kernel_declaration_str = kernel_code_str[:first_open_bracket] + kernel_body_str = kernel_code_str[first_open_bracket:] + return kernel_declaration_str, kernel_body_str + + +def generate_kernel_wrapper_declaration(kernel_name, example_inputs): + return_type = cpp_parse.CPPType(name='Tuple', is_pointer=False) + args = [] + for i in range(len(example_inputs)): + arg = cpp_parse.CPPArg(name=f'in_ptr{i}', type=cpp_parse.CPPType(name='NDArray', is_pointer=False), + is_const=False, is_restricted=False) + args.append(arg) + kernel_wrapper_declaration = cpp_parse.CPPDeclaration(func_name=kernel_name, + return_type=return_type, + args=args, + is_extern_c=False) + return kernel_wrapper_declaration + + +def generate_ndarray_allocate_statement(output_name: str, dtype: str, device: str, shape: List[int]): + assert dtype in ['int32', 'int64', 'float32', 'float64'] + assert device == 'cpu' + assert isinstance(shape, List) + for shape_int in shape: + assert isinstance(shape_int, int) + + shape = [str(shape_int) for shape_int in shape] + shape_str = ', '.join(shape) + + return f'NDArray {output_name} = createNDArray("{dtype}", "{device}", {{{shape_str}}});' + + +def generate_ndarray_cast(var_name, dtype): + return f'({dtype}*){var_name}.Data<{dtype}>()' + + +def generate_kernel_wrapper_return(fake_output): + output_str = [f'out_ptr{i}' for i in range(len(fake_output))] + output_str = ','.join(output_str) + return f'return Kernel_Tuple::make(std::initializer_list{{{output_str}}});' + + +TORCH_DTYPE_TO_NDARRAY_DTYPE = { + torch.float32: 'float32', + torch.float64: 'float64', + torch.int32: 'int32', + torch.int64: 'int64' +} + + +def generate_kernel_wrapper_body(kernel_declaration: cpp_parse.CPPDeclaration, + fake_output: List[torch.Tensor]): + # step 0: obtain output args from kernel_declaration + + # step 1: allocate output NDArray + ndarray_allocate_statements = [] + for i, output in enumerate(fake_output): + assert output.dtype in TORCH_DTYPE_TO_NDARRAY_DTYPE + dtype = TORCH_DTYPE_TO_NDARRAY_DTYPE[output.dtype] + + ndarray_allocate_statement = generate_ndarray_allocate_statement(output_name=f'out_ptr{i}', + dtype=dtype, + device=str(output.device), + shape=list(output.shape)) + ndarray_allocate_statements.append(ndarray_allocate_statement) + + ndarray_allocate_statements = '\n'.join(ndarray_allocate_statements) + '\n\n' + + # step 2: invoke kernel + kernel_invoke_param = [] + for arg in kernel_declaration.args: + kernel_invoke_param.append(generate_ndarray_cast(var_name=arg.name, dtype=arg.type.name)) + + num_space = 10 + delimiter = ',\n' + ' ' * 10 + kernel_invoke_param_str = delimiter.join(kernel_invoke_param) + kernel_invoke_str = kernel_declaration.func_name + '(' + '\n' + ' ' * num_space + \ + kernel_invoke_param_str + '\n' + ')' + '\n' + + # step 3: return output as a Tuple + return_str = generate_kernel_wrapper_return(fake_output) + + # step 4: add bracket + final_result = '\n{\n' + ndarray_allocate_statements + kernel_invoke_str + return_str + '\n}' + + return final_result + + +def matx_cpp_code_format(code: str, kernel_name: str, + example_inputs: List[torch.Tensor], + fake_output: List[torch.Tensor]) -> str: + code = extract_cpp_code(code) + # split include and kernel code + + include_code_str, kernel_code_str = split_include_kernel(code) + # add matx include + include_code_str += MATX_INCLUDE + + # extract kernel declaration + kernel_declaration_str, kernel_body_str = split_declaration_body(kernel_code_str) + + kernel_declaration = cpp_parse.parse_cpp_declaration(kernel_declaration_str) + kernel_return_type = kernel_declaration.return_type.name + assert kernel_return_type == 'void', f'The kernel return type must be void, Got {kernel_return_type}' + + kernel_declaration.func_name += MAGIC_NUMBER + kernel_code_str = str(kernel_declaration) + kernel_body_str + + # here, we keep the original kernel and add a wrapper + kernel_wrapper_declaration = generate_kernel_wrapper_declaration(kernel_name, example_inputs) + kernel_wrapper_body = generate_kernel_wrapper_body(kernel_declaration, fake_output) + + kernel_wrapper_declaration_without_default = copy.deepcopy(kernel_wrapper_declaration) + kernel_wrapper_declaration_without_default.append_arg(SESSION_HANLDER) + kernel_wrapper_declaration_with_default = copy.deepcopy(kernel_wrapper_declaration) + kernel_wrapper_declaration_with_default.append_arg(SESSION_HANLDER_WITH_DEAFULT) + + # create all the declarations strings + function_declaration = [CREATE_NDARRAY_DECLARATION, str(kernel_wrapper_declaration_with_default), + str(kernel_declaration) + ';', get_c_api_declare(kernel_wrapper_declaration.func_name)] + + function_declaration_str = '\n\n'.join(function_declaration) + '\n' + + # create all the kernel implementation strings including + # 1. create ndarray. 2. kernel wrapper, 3. kernel, 4. kernel-c-api + kernel_wrapper = str(kernel_wrapper_declaration) + kernel_wrapper_body + kernel_c_api_impl_str = get_c_api(kernel_name=kernel_wrapper_declaration.func_name, + args=kernel_wrapper_declaration.args, + has_return_value=kernel_wrapper_declaration.return_type.name != 'void') + + implementations = [CREATE_NDARRAY_IMPLEMENTATION, kernel_wrapper, kernel_code_str, kernel_c_api_impl_str] + implementations_str = '\n\n'.join(implementations) + '\n' + + # add namespace + kernel_code_str = ['namespace {', function_declaration_str, implementations_str, '} // namespace'] + kernel_code_str = '\n\n'.join(kernel_code_str) + + # registration str + registration_code_str = get_registration_str(kernel_name=kernel_declaration.func_name) + + # final code + final_code = [include_code_str, kernel_code_str, registration_code_str] + + final_code = '\n\n'.join(final_code) + + return final_code diff --git a/python/matx/torch_compiler/codegen/utils/__init__.py b/python/matx/torch_compiler/codegen/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/matx/torch_compiler/utils/cpp_parse.py b/python/matx/torch_compiler/codegen/utils/cpp_parse.py similarity index 100% rename from python/matx/torch_compiler/utils/cpp_parse.py rename to python/matx/torch_compiler/codegen/utils/cpp_parse.py diff --git a/python/matx/torch_compiler/tests/nested_inputs.py b/python/matx/torch_compiler/tests/nested_inputs.py new file mode 100644 index 00000000..e69de29b diff --git a/test/inductor/test_basic.py b/test/inductor/test_basic.py new file mode 100644 index 00000000..e69de29b From b3c0289fb2eaf6301936dac0c80f652ea570cf07 Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Wed, 1 Feb 2023 15:58:56 +0800 Subject: [PATCH 08/21] add kernel wrapper to match the raw python function signature --- cpp_playground/main.cpp | 3 - python/matx/inductor/__init__.py | 27 +++--- python/matx/toolchain.py | 9 +- .../matx/torch_compiler/codegen/__init__.py | 2 + .../codegen/inductor/__init__.py | 90 +++++++++++++++++++ .../torch_compiler/codegen/matx_formatter.py | 13 +-- .../torch_compiler/tests/nested_inputs.py | 0 .../torch_compiler/tests/simple_inductor.py | 17 ++-- .../matx/torch_compiler/tests/tuple_output.py | 8 ++ 9 files changed, 136 insertions(+), 33 deletions(-) delete mode 100644 cpp_playground/main.cpp delete mode 100644 python/matx/torch_compiler/tests/nested_inputs.py create mode 100644 python/matx/torch_compiler/tests/tuple_output.py diff --git a/cpp_playground/main.cpp b/cpp_playground/main.cpp deleted file mode 100644 index c302b43a..00000000 --- a/cpp_playground/main.cpp +++ /dev/null @@ -1,3 +0,0 @@ -// -// Created by ByteDance on 2023/2/1. -// diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py index 88e87920..e19f2c57 100644 --- a/python/matx/inductor/__init__.py +++ b/python/matx/inductor/__init__.py @@ -2,23 +2,19 @@ from typing import List import torch -from matx.torch_compiler.codegen import extract_inductor_code, matx_cpp_code_format from matx.env import MATX_DEV_MODE from matx.script import context from matx.toolchain import path_prefix +from matx.torch_compiler.codegen import extract_inductor_code, matx_cpp_code_format def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> context.ScriptContext: try: - - code, kernel_name = extract_inductor_code(compiling_obj, example_inputs) - code = matx_cpp_code_format(code, kernel_name) - + # set sc_ctx attributes to be compatible with existing matx code sc_ctx = context.ScriptContext() sc_ctx.build_type = context.BuildType.FUNCTION sc_ctx.main_node.raw = compiling_obj - # set sc_ctx attributes to be compatible with existing matx code inductor_context = context.InductorContext(fn_name=compiling_obj.__name__) sc_ctx.main_node.context = inductor_context # set source code TODO: formatting source code @@ -27,6 +23,18 @@ def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> cont frame = inspect.stack()[3] sc_ctx.main_node.span.file_name = frame[0].f_code.co_filename + # set args types. + from .. import ir + + # TODO: currently, we only support argument as NDArray. We may support nested inputs later + signature = inspect.signature(compiling_obj) + for param in signature.parameters.values(): + sc_ctx.main_node.context.arg_types[param.name] = ir.type.NDArrayType() + + # compile the kernel and set the code + code, kernel_name, fake_output = extract_inductor_code(compiling_obj, example_inputs) + code = matx_cpp_code_format(code, kernel_name, example_inputs, fake_output) + # export code path = path_prefix(sc_ctx) with open(path, 'w') as f: @@ -37,13 +45,6 @@ def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> cont build_module = _ffi.get_global_func("embedded.build.c") sc_ctx.rt_module = build_module(code.encode()) - # set args types. # TODO: hardcode for now - from .. import ir - sc_ctx.main_node.context.arg_types = dict( - a=ir.type.NDArrayType(), - b=ir.type.NDArrayType() - ) - return sc_ctx except BaseException as e: if MATX_DEV_MODE: diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py index bd946627..557bc278 100644 --- a/python/matx/toolchain.py +++ b/python/matx/toolchain.py @@ -400,23 +400,20 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/TH', '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/THC', '-I/Users/bytedance/miniforge3/envs/inductor/include/python3.10', - '-lgomp', - '-march=native', + # '-lgomp', + # '-march=native', '-ffast-math', '-fno-finite-math-only', - '-fopenmp', + # '-fopenmp', '-DC10_USING_CUSTOM_GENERATED_MACROS' ] - build_dso(result, toolchain is not None, compile_options=torch_compiler_options) if toolchain is not None: toolchain_build(result, toolchain) if result.build_type is context.BuildType.FUNCTION: return make_jit_op_creator(result, share, bundle_args=bundle_args)() - elif result.build_type is context.BuildType.JIT_OBJECT: - return make_jit_object_creator(result, share, bundle_args=bundle_args) else: raise ValueError('Unsupported build_type: {}'.format(result.build_type)) diff --git a/python/matx/torch_compiler/codegen/__init__.py b/python/matx/torch_compiler/codegen/__init__.py index e69de29b..6935d78a 100644 --- a/python/matx/torch_compiler/codegen/__init__.py +++ b/python/matx/torch_compiler/codegen/__init__.py @@ -0,0 +1,2 @@ +from .inductor import extract_inductor_code +from .matx_formatter import matx_cpp_code_format \ No newline at end of file diff --git a/python/matx/torch_compiler/codegen/inductor/__init__.py b/python/matx/torch_compiler/codegen/inductor/__init__.py index e69de29b..837d4448 100644 --- a/python/matx/torch_compiler/codegen/inductor/__init__.py +++ b/python/matx/torch_compiler/codegen/inductor/__init__.py @@ -0,0 +1,90 @@ +from typing import List, Tuple + +import torch +import torch._inductor.compile_fx as compile_fx +from torch import fx +from torch._inductor.debug import DebugContext +from torch._inductor.virtualized import V + +""" +Use a global variable to hack the compile_fx_inner and record the compiled code. +This works in single process problem, but requires careful review in multi-processing +""" + + +class FakeCallableWithCode(): + code = None + + def __call__(self, *args, **kwargs): + raise NotImplementedError + + def set_code(self, code): + self.code = code + + +fake_callable = FakeCallableWithCode() + + +@DebugContext.wrap +@torch.utils._python_dispatch._disable_current_modes() +def compile_fx_inner_cpu( + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + cudagraphs=None, + num_fixed=0, + is_backward=False, + graph_id=None, +): + # lift the maximum depth of the Python interpreter stack + # to adapt large/deep models + compile_fx.sys.setrecursionlimit(max(compile_fx.sys.getrecursionlimit(), 2000)) + V.debug.fx_graph(gm, example_inputs) + shape_env = compile_fx._shape_env_from_inputs(example_inputs) + fake_mode = compile_fx.fake_mode_from_tensors(example_inputs) + graph = compile_fx.GraphLowering( + gm, + shape_env=shape_env, + num_static_inputs=num_fixed, + graph_id=graph_id, + fake_mode=fake_mode, + ) + with V.set_graph_handler(graph): + graph.run(*example_inputs) + code = graph.codegen() + fake_callable.set_code(code) + + return fake_callable + + +def assert_tuple_of_tensors(tensors): + assert isinstance(tensors, Tuple) + for tensor in tensors: + assert isinstance(tensor, torch.Tensor), 'Each element in tensors must be a torch.Tensor' + + +from torch._subclasses import FakeTensor, FakeTensorMode + + +def extract_inductor_code(kernel, example_inputs): + # check kernel input and output. All the input must be a Tensor. The output must be a tuple of Tensor + # TODO: remove this constraints (long term) + assert isinstance(example_inputs, (List, Tuple)) + example_inputs = tuple(example_inputs) + assert_tuple_of_tensors(example_inputs) + fake_mode = FakeTensorMode() + fake_example_inputs = [FakeTensor.from_tensor(t, fake_mode=fake_mode) for t in example_inputs] + fake_output = kernel(*fake_example_inputs) + assert_tuple_of_tensors(fake_output) + + model = fx.symbolic_trace(kernel) + compile_fx.compile_fx(model, example_inputs_=fake_example_inputs, inner_compile=compile_fx_inner_cpu) + + code = fake_callable.code + + # By default, Pytorch compiles a Python module with all the C++ kernel with unified name kernel. + # The actual kernel name should be kernel.__name__. + # TODO: fix this after rewriting inductor codegen to all C++ instead of a Python module + kernel_name = kernel.__name__ + + # fake_output is used + return code, kernel_name, fake_output diff --git a/python/matx/torch_compiler/codegen/matx_formatter.py b/python/matx/torch_compiler/codegen/matx_formatter.py index 36cce17f..0e740fc9 100644 --- a/python/matx/torch_compiler/codegen/matx_formatter.py +++ b/python/matx/torch_compiler/codegen/matx_formatter.py @@ -35,6 +35,7 @@ MATX_INCLUDE = ''' #include "matxscript/runtime/codegen_all_includes.h" +#include "matxscript/runtime/container/ndarray_helper.h" #include using namespace ::matxscript::runtime; @@ -250,7 +251,7 @@ def generate_kernel_wrapper_body(kernel_declaration: cpp_parse.CPPDeclaration, delimiter = ',\n' + ' ' * 10 kernel_invoke_param_str = delimiter.join(kernel_invoke_param) kernel_invoke_str = kernel_declaration.func_name + '(' + '\n' + ' ' * num_space + \ - kernel_invoke_param_str + '\n' + ')' + '\n' + kernel_invoke_param_str + '\n' + ');' + '\n' # step 3: return output as a Tuple return_str = generate_kernel_wrapper_return(fake_output) @@ -269,7 +270,7 @@ def matx_cpp_code_format(code: str, kernel_name: str, include_code_str, kernel_code_str = split_include_kernel(code) # add matx include - include_code_str += MATX_INCLUDE + include_code_str = MATX_INCLUDE # extract kernel declaration kernel_declaration_str, kernel_body_str = split_declaration_body(kernel_code_str) @@ -278,7 +279,7 @@ def matx_cpp_code_format(code: str, kernel_name: str, kernel_return_type = kernel_declaration.return_type.name assert kernel_return_type == 'void', f'The kernel return type must be void, Got {kernel_return_type}' - kernel_declaration.func_name += MAGIC_NUMBER + kernel_declaration.func_name += MAGIC_NUMBER # TODO: currently, we simply add magic number to avoid conflict kernel_code_str = str(kernel_declaration) + kernel_body_str # here, we keep the original kernel and add a wrapper @@ -291,14 +292,14 @@ def matx_cpp_code_format(code: str, kernel_name: str, kernel_wrapper_declaration_with_default.append_arg(SESSION_HANLDER_WITH_DEAFULT) # create all the declarations strings - function_declaration = [CREATE_NDARRAY_DECLARATION, str(kernel_wrapper_declaration_with_default), + function_declaration = [CREATE_NDARRAY_DECLARATION, str(kernel_wrapper_declaration_with_default) + ';', str(kernel_declaration) + ';', get_c_api_declare(kernel_wrapper_declaration.func_name)] function_declaration_str = '\n\n'.join(function_declaration) + '\n' # create all the kernel implementation strings including # 1. create ndarray. 2. kernel wrapper, 3. kernel, 4. kernel-c-api - kernel_wrapper = str(kernel_wrapper_declaration) + kernel_wrapper_body + kernel_wrapper = str(kernel_wrapper_declaration_without_default) + kernel_wrapper_body kernel_c_api_impl_str = get_c_api(kernel_name=kernel_wrapper_declaration.func_name, args=kernel_wrapper_declaration.args, has_return_value=kernel_wrapper_declaration.return_type.name != 'void') @@ -311,7 +312,7 @@ def matx_cpp_code_format(code: str, kernel_name: str, kernel_code_str = '\n\n'.join(kernel_code_str) # registration str - registration_code_str = get_registration_str(kernel_name=kernel_declaration.func_name) + registration_code_str = get_registration_str(kernel_name=kernel_wrapper_declaration.func_name) # final code final_code = [include_code_str, kernel_code_str, registration_code_str] diff --git a/python/matx/torch_compiler/tests/nested_inputs.py b/python/matx/torch_compiler/tests/nested_inputs.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python/matx/torch_compiler/tests/simple_inductor.py b/python/matx/torch_compiler/tests/simple_inductor.py index a48ed2cf..1e7b7255 100644 --- a/python/matx/torch_compiler/tests/simple_inductor.py +++ b/python/matx/torch_compiler/tests/simple_inductor.py @@ -1,10 +1,13 @@ import json +import numpy as np + import matx import torch -@matx.inductor(example_inputs=[torch.randn(5), torch.randn(5)]) +@matx.inductor(example_inputs=[torch.from_numpy(np.random.randn(5).astype(np.int32)), + torch.from_numpy(np.random.randn(5).astype(np.int32))]) def add_relu(a: matx.NDArray, b: matx.NDArray): c = a + b c = torch.nn.functional.relu(c) @@ -19,11 +22,10 @@ def add_json(a: str, b: str) -> str: a_list = json.loads(a) b_list = json.loads(b) - a_tensor = matx.NDArray(arr=a_list, shape=[5], dtype='float32') - b_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32') - c_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32') + a_tensor = matx.NDArray(arr=a_list, shape=[5], dtype='int32') + b_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='int32') - add_relu(a_tensor, b_tensor, c_tensor) + c_tensor = add_relu(a_tensor, b_tensor)[0] result_lst = c_tensor.tolist() @@ -31,6 +33,11 @@ def add_json(a: str, b: str) -> str: if __name__ == '__main__': + a_tensor = matx.NDArray(arr=[1, 2, 3, 4, 5], shape=[5], dtype='int32') + b_tensor = matx.NDArray(arr=[6, 7, 8, 8, 10], shape=[5], dtype='int32') + c_tensor = add_relu(a_tensor, b_tensor) + print(c_tensor) + print(f'Pytorch version {torch.__version__}') a = json.dumps([1, 2, 3, 4, 5]) b = json.dumps([6, 7, 8, 9, 10]) diff --git a/python/matx/torch_compiler/tests/tuple_output.py b/python/matx/torch_compiler/tests/tuple_output.py new file mode 100644 index 00000000..3e80cba4 --- /dev/null +++ b/python/matx/torch_compiler/tests/tuple_output.py @@ -0,0 +1,8 @@ +import matx + +from typing import Tuple + + +@matx.script +def func(a: int, b: int) -> Tuple[int, int]: + return a, b From df1f9bcb74e552e61e92b1ee7efe837f2a25516a Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Wed, 1 Feb 2023 17:03:08 +0800 Subject: [PATCH 09/21] add license, add basic tests --- python/matx/__init__.py | 3 +- python/matx/inductor/__init__.py | 19 +++++ python/matx/torch_compiler/__init__.py | 19 +++++ .../matx/torch_compiler/codegen/__init__.py | 19 +++++ .../codegen/inductor/__init__.py | 19 +++++ .../torch_compiler/codegen/utils/__init__.py | 18 +++++ .../torch_compiler/codegen/utils/cpp_parse.py | 19 +++++ .../matx/torch_compiler/tests/tuple_output.py | 8 --- test/inductor/test_basic.py | 69 +++++++++++++++++++ 9 files changed, 184 insertions(+), 9 deletions(-) delete mode 100644 python/matx/torch_compiler/tests/tuple_output.py diff --git a/python/matx/__init__.py b/python/matx/__init__.py index 78cb0d35..4bab2723 100644 --- a/python/matx/__init__.py +++ b/python/matx/__init__.py @@ -351,7 +351,7 @@ def script(compiling_obj, *args, backend=None, **kwargs): return toolchain.script(compiling_obj, *args, **kwargs) -def inductor(example_inputs, **kwargs): +def inductor_script(example_inputs, **kwargs): """ Args: @@ -361,6 +361,7 @@ def inductor(example_inputs, **kwargs): Returns: a wrapper that compiles the compiling_obj into a JIT FUNCTION """ + def inner_inductor(compiling_obj): return toolchain.inductor(compiling_obj, example_inputs, **kwargs) diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py index e19f2c57..c09dd4a5 100644 --- a/python/matx/inductor/__init__.py +++ b/python/matx/inductor/__init__.py @@ -1,3 +1,22 @@ +# Copyright 2022 ByteDance Ltd. and/or its affiliates. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import inspect from typing import List diff --git a/python/matx/torch_compiler/__init__.py b/python/matx/torch_compiler/__init__.py index e83ac962..2fe03049 100644 --- a/python/matx/torch_compiler/__init__.py +++ b/python/matx/torch_compiler/__init__.py @@ -1,3 +1,22 @@ +# Copyright 2022 ByteDance Ltd. and/or its affiliates. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + minimum_torch_version = '2.0.0a0' try: diff --git a/python/matx/torch_compiler/codegen/__init__.py b/python/matx/torch_compiler/codegen/__init__.py index 6935d78a..1dd70d50 100644 --- a/python/matx/torch_compiler/codegen/__init__.py +++ b/python/matx/torch_compiler/codegen/__init__.py @@ -1,2 +1,21 @@ +# Copyright 2022 ByteDance Ltd. and/or its affiliates. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from .inductor import extract_inductor_code from .matx_formatter import matx_cpp_code_format \ No newline at end of file diff --git a/python/matx/torch_compiler/codegen/inductor/__init__.py b/python/matx/torch_compiler/codegen/inductor/__init__.py index 837d4448..95d543b2 100644 --- a/python/matx/torch_compiler/codegen/inductor/__init__.py +++ b/python/matx/torch_compiler/codegen/inductor/__init__.py @@ -1,3 +1,22 @@ +# Copyright 2022 ByteDance Ltd. and/or its affiliates. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import List, Tuple import torch diff --git a/python/matx/torch_compiler/codegen/utils/__init__.py b/python/matx/torch_compiler/codegen/utils/__init__.py index e69de29b..84bf20b4 100644 --- a/python/matx/torch_compiler/codegen/utils/__init__.py +++ b/python/matx/torch_compiler/codegen/utils/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2022 ByteDance Ltd. and/or its affiliates. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. \ No newline at end of file diff --git a/python/matx/torch_compiler/codegen/utils/cpp_parse.py b/python/matx/torch_compiler/codegen/utils/cpp_parse.py index e0d9611d..499c0b9f 100644 --- a/python/matx/torch_compiler/codegen/utils/cpp_parse.py +++ b/python/matx/torch_compiler/codegen/utils/cpp_parse.py @@ -1,3 +1,22 @@ +# Copyright 2022 ByteDance Ltd. and/or its affiliates. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import dataclasses from typing import List, Union diff --git a/python/matx/torch_compiler/tests/tuple_output.py b/python/matx/torch_compiler/tests/tuple_output.py deleted file mode 100644 index 3e80cba4..00000000 --- a/python/matx/torch_compiler/tests/tuple_output.py +++ /dev/null @@ -1,8 +0,0 @@ -import matx - -from typing import Tuple - - -@matx.script -def func(a: int, b: int) -> Tuple[int, int]: - return a, b diff --git a/test/inductor/test_basic.py b/test/inductor/test_basic.py index e69de29b..e60c0a29 100644 --- a/test/inductor/test_basic.py +++ b/test/inductor/test_basic.py @@ -0,0 +1,69 @@ +# Copyright 2022 ByteDance Ltd. and/or its affiliates. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import unittest +import matx +import torch +import numpy as np + + +class BasicTests(unittest.TestCase): + def test_basics(self): + from matx import toolchain + toolchain.USE_SO_CACHE = False + + def add_relu(a, b): + c = a + b + c = torch.nn.functional.relu(c) + return c, + + sizes = [(5,), (10,), (2, 3), (4, 5, 6)] + dtypes = [np.float32, np.float64, np.int32, np.int64] + + for size in sizes: + for dtype in dtypes: + a_numpy = np.random.randn(*size).astype(dtype) + b_numpy = np.random.randn(*size).astype(dtype) + + example_inputs = [torch.from_numpy(np.random.randn(*size).astype(dtype)), + torch.from_numpy(np.random.randn(*size).astype(dtype))] + + add_relu_kernel = matx.inductor_script(example_inputs)(add_relu) + + a_tensor = torch.from_numpy(a_numpy) + b_tensor = torch.from_numpy(b_numpy) + + a_ndarray = matx.NDArray([], a_numpy.shape, str(a_numpy.dtype)) + a_ndarray.from_numpy(a_numpy) + b_ndarray = matx.NDArray([], b_numpy.shape, str(b_numpy.dtype)) + b_ndarray.from_numpy(b_numpy) + + c_tensor_expected = add_relu(a_tensor, b_tensor)[0] + c_ndarray: matx.NDArray = add_relu_kernel(a_ndarray, b_ndarray)[0] + c_tensor = c_ndarray.torch() + + # TODO: there seems a strange cache behavior of JITOp, without the following line, + # it fails. + del add_relu_kernel + + torch.testing.assert_close(c_tensor_expected, c_tensor) + + +if __name__ == '__main__': + unittest.main() From 6aebd22806e335c3870016d3b2adb29373e8ba2e Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Fri, 3 Feb 2023 16:48:49 +0800 Subject: [PATCH 10/21] update codegen to match new NDArray impl --- python/matx/pipeline/_register_conveter.py | 15 +++ python/matx/toolchain.py | 30 +++--- .../matx/torch_compiler/codegen/__init__.py | 2 +- .../codegen/inductor/__init__.py | 5 +- .../torch_compiler/codegen/matx_formatter.py | 97 ++++++++++++------- .../torch_compiler/codegen/utils/__init__.py | 2 +- .../torch_compiler/tests/simple_inductor.py | 7 +- test/inductor/test_basic.py | 16 ++- 8 files changed, 111 insertions(+), 63 deletions(-) diff --git a/python/matx/pipeline/_register_conveter.py b/python/matx/pipeline/_register_conveter.py index 10df789f..c28e7246 100644 --- a/python/matx/pipeline/_register_conveter.py +++ b/python/matx/pipeline/_register_conveter.py @@ -17,6 +17,17 @@ # specific language governing permissions and limitations # under the License. + +try: + # TODO: consider lazy import this after users called matx.inductor_script + import torch + import torch.utils.dlpack + + HAS_TORCH = True +except: + HAS_TORCH = False + +import matx from .._ffi._selector import _set_fast_pipeline_object_converter from .._ffi._selector import _set_class_symbol from .symbol import BaseSymbol @@ -29,9 +40,13 @@ def _pipeline_object_converter(value): return value.native_op if isinstance(value, OpKernel): return value.native_op + if HAS_TORCH and isinstance(value, torch.Tensor): + return matx.array.from_dlpack(torch.utils.dlpack.to_dlpack(value)) return value _PipelineClasses = (JitObject, OpKernel,) +if HAS_TORCH: + _PipelineClasses += (torch.Tensor,) _set_fast_pipeline_object_converter(_PipelineClasses, _pipeline_object_converter) _set_class_symbol(BaseSymbol) diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py index 557bc278..9018fbce 100644 --- a/python/matx/toolchain.py +++ b/python/matx/toolchain.py @@ -389,24 +389,24 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl if DISABLE_SCRIPT: return compiling_obj - from matx.inductor import from_source + from .inductor import from_source result: context.ScriptContext = from_source(compiling_obj, example_inputs) - # TODO: get Pytorch additional compiler flags. Hardcode here for mvp - torch_compiler_options = [ - '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include', - '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/torch/csrc/api/include', - '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/TH', - '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/THC', - '-I/Users/bytedance/miniforge3/envs/inductor/include/python3.10', - # '-lgomp', - # '-march=native', - '-ffast-math', - '-fno-finite-math-only', - # '-fopenmp', - '-DC10_USING_CUSTOM_GENERATED_MACROS' - ] + from torch._inductor import codecache + ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths(include_pytorch=False) + + # TODO: check whether the following flags are handled by common flags + # codecache.get_shared() + # codecache.optimization_flags() + # codecache.cpp_flags() + # codecache.get_warning_all_flag() + # codecache.use_custom_generated_macros() + + torch_compiler_options = ipaths.split() + lpaths.split() + libs.split() + macros.split() + + # TODO: fix this on macOS m1. + torch_compiler_options.remove('-lgomp') build_dso(result, toolchain is not None, compile_options=torch_compiler_options) if toolchain is not None: diff --git a/python/matx/torch_compiler/codegen/__init__.py b/python/matx/torch_compiler/codegen/__init__.py index 1dd70d50..9ad89473 100644 --- a/python/matx/torch_compiler/codegen/__init__.py +++ b/python/matx/torch_compiler/codegen/__init__.py @@ -18,4 +18,4 @@ # under the License. from .inductor import extract_inductor_code -from .matx_formatter import matx_cpp_code_format \ No newline at end of file +from .matx_formatter import matx_cpp_code_format diff --git a/python/matx/torch_compiler/codegen/inductor/__init__.py b/python/matx/torch_compiler/codegen/inductor/__init__.py index 95d543b2..a85e6c16 100644 --- a/python/matx/torch_compiler/codegen/inductor/__init__.py +++ b/python/matx/torch_compiler/codegen/inductor/__init__.py @@ -96,7 +96,10 @@ def extract_inductor_code(kernel, example_inputs): assert_tuple_of_tensors(fake_output) model = fx.symbolic_trace(kernel) - compile_fx.compile_fx(model, example_inputs_=fake_example_inputs, inner_compile=compile_fx_inner_cpu) + compile_fx.compile_fx( + model, + example_inputs_=fake_example_inputs, + inner_compile=compile_fx_inner_cpu) code = fake_callable.code diff --git a/python/matx/torch_compiler/codegen/matx_formatter.py b/python/matx/torch_compiler/codegen/matx_formatter.py index 0e740fc9..f583c705 100644 --- a/python/matx/torch_compiler/codegen/matx_formatter.py +++ b/python/matx/torch_compiler/codegen/matx_formatter.py @@ -35,7 +35,6 @@ MATX_INCLUDE = ''' #include "matxscript/runtime/codegen_all_includes.h" -#include "matxscript/runtime/container/ndarray_helper.h" #include using namespace ::matxscript::runtime; @@ -49,25 +48,19 @@ SESSION_HANLDER = cpp_parse.CPPArg(name=f'handle_{MAGIC_NUMBER}', type=cpp_parse.CPPType(name='void', is_pointer=True)) -SESSION_HANLDER_WITH_DEAFULT = cpp_parse.CPPArg(name=f'handle_{MAGIC_NUMBER}', - type=cpp_parse.CPPType(name='void', is_pointer=True), - default_val='((void*)(int64_t)0)') - -CREATE_NDARRAY_DECLARATION = ''' -// helper function to create NDArray -NDArray createNDArray(const std::string& dtype, - const std::string& device, - const std::vector& arg_shape); -''' +SESSION_HANLDER_WITH_DEAFULT = cpp_parse.CPPArg( + name=f'handle_{MAGIC_NUMBER}', type=cpp_parse.CPPType( + name='void', is_pointer=True), default_val='((void*)(int64_t)0)') CREATE_NDARRAY_IMPLEMENTATION = ''' -NDArray createNDArray(const std::string& dtype, - const std::string& device, - const std::vector& arg_shape) { +NDArray createNDArray(const std::string& dtype, const std::string& device, const List& arg_shape) { Unicode dtype_str(UTF8Decode(dtype)); Unicode ctx_str(UTF8Decode(device)); - DataType data_type(String2DLDataType(UTF8Encode(dtype_str.view()))); - return NDArray::Empty(arg_shape, data_type, NDArrayHelper::GetDevice(ctx_str)); + + auto a = Kernel_NDArray::make(0., arg_shape, dtype_str, ctx_str); + // set impl to torch.Tensor + a.SetImpl(NDArray::Impl::torchTensor); + return a; } ''' @@ -88,13 +81,13 @@ def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) RTView pos_args[{}]; helper.unpack(pos_args, args, num_args); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 - auto ret = {}({}, + auto ret = {}({}, {}resource_handle); RTValue(std::move(ret)).MoveToCHost(out_ret_value); }} else {{ switch(num_args) {{ case {}: {{ - auto ret = {}({}, + auto ret = {}({}, {}resource_handle); // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5 RTValue(std::move(ret)).MoveToCHost(out_ret_value); }} break; @@ -128,9 +121,20 @@ def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) pos_arg_cast = (',' + pos_arg_cast_indentation).join(pos_arg_cast_lst) args_t_cast = (',' + args_t_cast_indentation).join(args_t_cast_lst) - return template.format(kernel_name, num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name, - pos_arg_cast, kernel_name_indentation, num_args, kernel_name, - args_t_cast, kernel_name_indentation) + return template.format( + kernel_name, + num_args, + arg_names_concat_str, + kernel_name, + num_args, + num_args, + kernel_name, + pos_arg_cast, + kernel_name_indentation, + num_args, + kernel_name, + args_t_cast, + kernel_name_indentation) def get_registration_str(kernel_name): @@ -183,8 +187,13 @@ def generate_kernel_wrapper_declaration(kernel_name, example_inputs): return_type = cpp_parse.CPPType(name='Tuple', is_pointer=False) args = [] for i in range(len(example_inputs)): - arg = cpp_parse.CPPArg(name=f'in_ptr{i}', type=cpp_parse.CPPType(name='NDArray', is_pointer=False), - is_const=False, is_restricted=False) + arg = cpp_parse.CPPArg( + name=f'in_ptr{i}', + type=cpp_parse.CPPType( + name='NDArray', + is_pointer=False), + is_const=False, + is_restricted=False) args.append(arg) kernel_wrapper_declaration = cpp_parse.CPPDeclaration(func_name=kernel_name, return_type=return_type, @@ -193,7 +202,11 @@ def generate_kernel_wrapper_declaration(kernel_name, example_inputs): return kernel_wrapper_declaration -def generate_ndarray_allocate_statement(output_name: str, dtype: str, device: str, shape: List[int]): +def generate_ndarray_allocate_statement( + output_name: str, + dtype: str, + device: str, + shape: List[int]): assert dtype in ['int32', 'int64', 'float32', 'float64'] assert device == 'cpu' assert isinstance(shape, List) @@ -251,7 +264,7 @@ def generate_kernel_wrapper_body(kernel_declaration: cpp_parse.CPPDeclaration, delimiter = ',\n' + ' ' * 10 kernel_invoke_param_str = delimiter.join(kernel_invoke_param) kernel_invoke_str = kernel_declaration.func_name + '(' + '\n' + ' ' * num_space + \ - kernel_invoke_param_str + '\n' + ');' + '\n' + kernel_invoke_param_str + '\n' + ');' + '\n' # step 3: return output as a Tuple return_str = generate_kernel_wrapper_return(fake_output) @@ -279,7 +292,8 @@ def matx_cpp_code_format(code: str, kernel_name: str, kernel_return_type = kernel_declaration.return_type.name assert kernel_return_type == 'void', f'The kernel return type must be void, Got {kernel_return_type}' - kernel_declaration.func_name += MAGIC_NUMBER # TODO: currently, we simply add magic number to avoid conflict + # TODO: currently, we simply add magic number to avoid conflict + kernel_declaration.func_name += MAGIC_NUMBER kernel_code_str = str(kernel_declaration) + kernel_body_str # here, we keep the original kernel and add a wrapper @@ -292,23 +306,38 @@ def matx_cpp_code_format(code: str, kernel_name: str, kernel_wrapper_declaration_with_default.append_arg(SESSION_HANLDER_WITH_DEAFULT) # create all the declarations strings - function_declaration = [CREATE_NDARRAY_DECLARATION, str(kernel_wrapper_declaration_with_default) + ';', - str(kernel_declaration) + ';', get_c_api_declare(kernel_wrapper_declaration.func_name)] + CREATE_NDARRAY_DECLARATION = split_declaration_body(CREATE_NDARRAY_IMPLEMENTATION)[0] + ';' + + function_declaration = [ + CREATE_NDARRAY_DECLARATION, + str(kernel_wrapper_declaration_with_default) + ';', + str(kernel_declaration) + ';', + get_c_api_declare( + kernel_wrapper_declaration.func_name)] function_declaration_str = '\n\n'.join(function_declaration) + '\n' # create all the kernel implementation strings including # 1. create ndarray. 2. kernel wrapper, 3. kernel, 4. kernel-c-api kernel_wrapper = str(kernel_wrapper_declaration_without_default) + kernel_wrapper_body - kernel_c_api_impl_str = get_c_api(kernel_name=kernel_wrapper_declaration.func_name, - args=kernel_wrapper_declaration.args, - has_return_value=kernel_wrapper_declaration.return_type.name != 'void') - - implementations = [CREATE_NDARRAY_IMPLEMENTATION, kernel_wrapper, kernel_code_str, kernel_c_api_impl_str] + kernel_c_api_impl_str = get_c_api( + kernel_name=kernel_wrapper_declaration.func_name, + args=kernel_wrapper_declaration.args, + has_return_value=kernel_wrapper_declaration.return_type.name != 'void') + + implementations = [ + CREATE_NDARRAY_IMPLEMENTATION, + kernel_wrapper, + kernel_code_str, + kernel_c_api_impl_str] implementations_str = '\n\n'.join(implementations) + '\n' # add namespace - kernel_code_str = ['namespace {', function_declaration_str, implementations_str, '} // namespace'] + kernel_code_str = [ + 'namespace {', + function_declaration_str, + implementations_str, + '} // namespace'] kernel_code_str = '\n\n'.join(kernel_code_str) # registration str diff --git a/python/matx/torch_compiler/codegen/utils/__init__.py b/python/matx/torch_compiler/codegen/utils/__init__.py index 84bf20b4..9e19ab85 100644 --- a/python/matx/torch_compiler/codegen/utils/__init__.py +++ b/python/matx/torch_compiler/codegen/utils/__init__.py @@ -15,4 +15,4 @@ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations -# under the License. \ No newline at end of file +# under the License. diff --git a/python/matx/torch_compiler/tests/simple_inductor.py b/python/matx/torch_compiler/tests/simple_inductor.py index 1e7b7255..f611743f 100644 --- a/python/matx/torch_compiler/tests/simple_inductor.py +++ b/python/matx/torch_compiler/tests/simple_inductor.py @@ -6,8 +6,8 @@ import torch -@matx.inductor(example_inputs=[torch.from_numpy(np.random.randn(5).astype(np.int32)), - torch.from_numpy(np.random.randn(5).astype(np.int32))]) +@matx.inductor_script(example_inputs=[torch.from_numpy(np.random.randn(5).astype(np.int32)), + torch.from_numpy(np.random.randn(5).astype(np.int32))]) def add_relu(a: matx.NDArray, b: matx.NDArray): c = a + b c = torch.nn.functional.relu(c) @@ -35,6 +35,9 @@ def add_json(a: str, b: str) -> str: if __name__ == '__main__': a_tensor = matx.NDArray(arr=[1, 2, 3, 4, 5], shape=[5], dtype='int32') b_tensor = matx.NDArray(arr=[6, 7, 8, 8, 10], shape=[5], dtype='int32') + + a_tensor = a_tensor.torch(copy=True) + c_tensor = add_relu(a_tensor, b_tensor) print(c_tensor) diff --git a/test/inductor/test_basic.py b/test/inductor/test_basic.py index e60c0a29..5373a6af 100644 --- a/test/inductor/test_basic.py +++ b/test/inductor/test_basic.py @@ -24,7 +24,9 @@ class BasicTests(unittest.TestCase): + def test_basics(self): + # TODO: fix cache_hit issues. from matx import toolchain toolchain.USE_SO_CACHE = False @@ -49,21 +51,17 @@ def add_relu(a, b): a_tensor = torch.from_numpy(a_numpy) b_tensor = torch.from_numpy(b_numpy) - a_ndarray = matx.NDArray([], a_numpy.shape, str(a_numpy.dtype)) - a_ndarray.from_numpy(a_numpy) - b_ndarray = matx.NDArray([], b_numpy.shape, str(b_numpy.dtype)) - b_ndarray.from_numpy(b_numpy) - c_tensor_expected = add_relu(a_tensor, b_tensor)[0] - c_ndarray: matx.NDArray = add_relu_kernel(a_ndarray, b_ndarray)[0] - c_tensor = c_ndarray.torch() + c_tensor = add_relu_kernel(a_tensor, b_tensor)[0] - # TODO: there seems a strange cache behavior of JITOp, without the following line, - # it fails. + # TODO: there seems a strange cache behavior of JITOp, without the + # following line, it fails. del add_relu_kernel torch.testing.assert_close(c_tensor_expected, c_tensor) + toolchain.USE_SO_CACHE = True + if __name__ == '__main__': unittest.main() From e6459c467621c01abfa50b4329976e601a74281d Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 6 Feb 2023 12:30:17 +0800 Subject: [PATCH 11/21] fix matx.inductor cache_hit --- python/matx/__init__.py | 2 +- python/matx/inductor/context/__init__.py | 0 .../script/analysis/build_type_analysis.py | 2 +- .../matx/script/context/inductor_context.py | 4 +- python/matx/{ => script}/inductor/__init__.py | 66 ++++++++++--------- python/matx/script/inductor/tensor_spec.py | 30 +++++++++ python/matx/toolchain.py | 32 +++++++-- test/inductor/test_basic.py | 13 +--- 8 files changed, 100 insertions(+), 49 deletions(-) delete mode 100644 python/matx/inductor/context/__init__.py rename python/matx/{ => script}/inductor/__init__.py (52%) create mode 100644 python/matx/script/inductor/tensor_spec.py diff --git a/python/matx/__init__.py b/python/matx/__init__.py index 034c4a72..697c6c19 100644 --- a/python/matx/__init__.py +++ b/python/matx/__init__.py @@ -351,7 +351,7 @@ def script(compiling_obj, *args, backend=None, **kwargs): return toolchain.script(compiling_obj, *args, **kwargs) -def inductor_script(example_inputs, **kwargs): +def inductor(example_inputs, **kwargs): """ Args: diff --git a/python/matx/inductor/context/__init__.py b/python/matx/inductor/context/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python/matx/script/analysis/build_type_analysis.py b/python/matx/script/analysis/build_type_analysis.py index 6cfd0257..7d43cb84 100644 --- a/python/matx/script/analysis/build_type_analysis.py +++ b/python/matx/script/analysis/build_type_analysis.py @@ -30,7 +30,7 @@ def run(self, sc_ctx: context.ScriptContext): node_ctx = sc_ctx.main_node.context if isinstance(node_ctx, context.ClassContext): build_type = context.BuildType.JIT_OBJECT - elif isinstance(node_ctx, context.FunctionContext): + elif isinstance(node_ctx, (context.FunctionContext, context.InductorContext)): build_type = context.BuildType.FUNCTION else: raise RuntimeError("Only one-function, one-class source code is allowed") diff --git a/python/matx/script/context/inductor_context.py b/python/matx/script/context/inductor_context.py index ec775a1f..cc3d1c99 100644 --- a/python/matx/script/context/inductor_context.py +++ b/python/matx/script/context/inductor_context.py @@ -20,11 +20,13 @@ class InductorContext(object): def __init__(self, - fn_name: str = '', ): + fn_name: str = '', + example_inputs_spec=None): self.fn_name = fn_name self.unbound_name = fn_name self.return_type = None self.arg_types = {} # Deferred? + self.example_inputs_spec = example_inputs_spec @property def name(self): diff --git a/python/matx/inductor/__init__.py b/python/matx/script/inductor/__init__.py similarity index 52% rename from python/matx/inductor/__init__.py rename to python/matx/script/inductor/__init__.py index c09dd4a5..9c75d904 100644 --- a/python/matx/inductor/__init__.py +++ b/python/matx/script/inductor/__init__.py @@ -22,48 +22,54 @@ import torch -from matx.env import MATX_DEV_MODE -from matx.script import context -from matx.toolchain import path_prefix from matx.torch_compiler.codegen import extract_inductor_code, matx_cpp_code_format +from .tensor_spec import TensorSpec +from .. import context, analysis +from ... import _ffi +from ... import ir +from ...env import MATX_DEV_MODE + + +def _embedded_inductor_ctx(compiling_obj, example_inputs): + code = _obtain_inductor_code(compiling_obj, example_inputs) + build_module = _ffi.get_global_func("embedded.build.c") + sc_ctx = context.ScriptContext() + sc_ctx.main_node.raw = compiling_obj + if isinstance(code, str): + code = code.encode() + sc_ctx.rt_module = build_module(code) + example_inputs_spec = [TensorSpec.from_tensor(inputs) for inputs in example_inputs] + sc_ctx.main_node.context = context.InductorContext(fn_name=compiling_obj.__name__, + example_inputs_spec=example_inputs_spec) + return sc_ctx + + +def _pass(sc_ctx: context.ScriptContext): + src_anls = analysis.SourceAnalysis() + src_anls.run(sc_ctx) + + +def _obtain_inductor_code(compiling_obj, example_inputs): + # compile the kernel and set the code + code, kernel_name, fake_output = extract_inductor_code(compiling_obj, example_inputs) + code = matx_cpp_code_format(code, kernel_name, example_inputs, fake_output) + return code def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> context.ScriptContext: try: - # set sc_ctx attributes to be compatible with existing matx code - sc_ctx = context.ScriptContext() - sc_ctx.build_type = context.BuildType.FUNCTION - sc_ctx.main_node.raw = compiling_obj - inductor_context = context.InductorContext(fn_name=compiling_obj.__name__) - sc_ctx.main_node.context = inductor_context - # set source code TODO: formatting source code - sc_ctx.main_node.span.source_code = inspect.getsource(compiling_obj) - # set filename. TODO: this is too hack - frame = inspect.stack()[3] - sc_ctx.main_node.span.file_name = frame[0].f_code.co_filename + # TODO: allow generalized way to specify example_inputs + sc_ctx = _embedded_inductor_ctx(compiling_obj, example_inputs) + # set filename. + _pass(sc_ctx) + analysis.BuildTypeAnalysis().run(sc_ctx) # set args types. - from .. import ir - # TODO: currently, we only support argument as NDArray. We may support nested inputs later signature = inspect.signature(compiling_obj) for param in signature.parameters.values(): sc_ctx.main_node.context.arg_types[param.name] = ir.type.NDArrayType() - # compile the kernel and set the code - code, kernel_name, fake_output = extract_inductor_code(compiling_obj, example_inputs) - code = matx_cpp_code_format(code, kernel_name, example_inputs, fake_output) - - # export code - path = path_prefix(sc_ctx) - with open(path, 'w') as f: - f.write(code) - - # set rt_module - from .. import _ffi - build_module = _ffi.get_global_func("embedded.build.c") - sc_ctx.rt_module = build_module(code.encode()) - return sc_ctx except BaseException as e: if MATX_DEV_MODE: diff --git a/python/matx/script/inductor/tensor_spec.py b/python/matx/script/inductor/tensor_spec.py new file mode 100644 index 00000000..bdee4a73 --- /dev/null +++ b/python/matx/script/inductor/tensor_spec.py @@ -0,0 +1,30 @@ +def convert_torch_dtype(dtype): + import torch + table = { + torch.int32: 'int32', + torch.int64: 'int64', + torch.float32: 'float32', + torch.float64: 'float64' + } + if dtype not in table: + raise NotImplementedError(f'Unsupport torch.Tensor dtype {dtype}') + + return table[dtype] + + +class TensorSpec(object): + def __init__(self, shape, dtype): + self.shape = tuple(shape) + self.dtype = dtype + + @classmethod + def from_tensor(cls, tensor): + import torch + assert isinstance(tensor, torch.Tensor) + return cls(shape=tuple(tensor.shape), dtype=convert_torch_dtype(tensor.dtype)) + + def __str__(self): + return str(self.shape) + ', ' + self.dtype + + def __repr__(self): + return f'TensorSpec({str(self)})' diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py index 9018fbce..96519f9b 100644 --- a/python/matx/toolchain.py +++ b/python/matx/toolchain.py @@ -252,6 +252,26 @@ def path_prefix(sc_ctx: context.ScriptContext): cache_md5)) +def path_prefix_inductor(sc_ctx: context.ScriptContext): + """inductor path_prefix encodes meta info from example_inputs""" + # mkdir LIB_PATH + from .__init__ import __version__ + _mk_lib_dir() + # code + sha1(libmatx.so) + commit_id(__version__) + dep_source_codes = "".join(dep_node.span.source_code for dep_node in sc_ctx.deps_node) + assert isinstance(sc_ctx.main_node.context, context.InductorContext) + example_inputs = sc_ctx.main_node.context.example_inputs_spec + example_inputs_str = ''.join([str(inputs) for inputs in example_inputs]) + cache_str = sc_ctx.main_node.span.source_code + dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__ + cache_md5 = hashlib.md5(cache_str.encode()).hexdigest()[:16] + file_name = os.path.splitext(os.path.basename(sc_ctx.main_node.span.file_name))[0] + return os.path.abspath('{}/lib{}_{}_{}_plugin_{}'.format(LIB_PATH, + file_name, + sc_ctx.main_node.span.lineno, + sc_ctx.main_node.context.name, + cache_md5)) + + def toolchain_path_prefix(sc_ctx: context.ScriptContext, toolchain_str: str): from .__init__ import __version__ # mkdir LIB_PATH @@ -297,10 +317,13 @@ def toolchain_build(sc_ctx: context.ScriptContext, toolchain: ToolChain): sc_ctx.dso_path = (sc_ctx.dso_path[0], so_path) -def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False, compile_options=None): +def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False, compile_options=None, make_path_prefix=None): rt_mod = sc_ctx.rt_module main_node_name = sc_ctx.main_node.context.name - base_path = path_prefix(sc_ctx) + if make_path_prefix is None: + make_path_prefix = path_prefix + + base_path = make_path_prefix(sc_ctx) with contrib.util.filelock(base_path): sopath = base_path + '.so' @@ -389,7 +412,7 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl if DISABLE_SCRIPT: return compiling_obj - from .inductor import from_source + from .script.inductor import from_source result: context.ScriptContext = from_source(compiling_obj, example_inputs) @@ -408,7 +431,8 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl # TODO: fix this on macOS m1. torch_compiler_options.remove('-lgomp') - build_dso(result, toolchain is not None, compile_options=torch_compiler_options) + build_dso(result, toolchain is not None, compile_options=torch_compiler_options, + make_path_prefix=path_prefix_inductor) if toolchain is not None: toolchain_build(result, toolchain) diff --git a/test/inductor/test_basic.py b/test/inductor/test_basic.py index 5373a6af..f28f981d 100644 --- a/test/inductor/test_basic.py +++ b/test/inductor/test_basic.py @@ -26,10 +26,6 @@ class BasicTests(unittest.TestCase): def test_basics(self): - # TODO: fix cache_hit issues. - from matx import toolchain - toolchain.USE_SO_CACHE = False - def add_relu(a, b): c = a + b c = torch.nn.functional.relu(c) @@ -46,22 +42,15 @@ def add_relu(a, b): example_inputs = [torch.from_numpy(np.random.randn(*size).astype(dtype)), torch.from_numpy(np.random.randn(*size).astype(dtype))] - add_relu_kernel = matx.inductor_script(example_inputs)(add_relu) + add_relu_kernel = matx.inductor(example_inputs)(add_relu) a_tensor = torch.from_numpy(a_numpy) b_tensor = torch.from_numpy(b_numpy) c_tensor_expected = add_relu(a_tensor, b_tensor)[0] c_tensor = add_relu_kernel(a_tensor, b_tensor)[0] - - # TODO: there seems a strange cache behavior of JITOp, without the - # following line, it fails. - del add_relu_kernel - torch.testing.assert_close(c_tensor_expected, c_tensor) - toolchain.USE_SO_CACHE = True - if __name__ == '__main__': unittest.main() From f6bc9c5bf18b367825b3d0130a4079412599cb8a Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 6 Feb 2023 12:31:35 +0800 Subject: [PATCH 12/21] fix py codestyle --- python/matx/toolchain.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py index 96519f9b..e01b2299 100644 --- a/python/matx/toolchain.py +++ b/python/matx/toolchain.py @@ -262,7 +262,8 @@ def path_prefix_inductor(sc_ctx: context.ScriptContext): assert isinstance(sc_ctx.main_node.context, context.InductorContext) example_inputs = sc_ctx.main_node.context.example_inputs_spec example_inputs_str = ''.join([str(inputs) for inputs in example_inputs]) - cache_str = sc_ctx.main_node.span.source_code + dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__ + cache_str = sc_ctx.main_node.span.source_code + \ + dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__ cache_md5 = hashlib.md5(cache_str.encode()).hexdigest()[:16] file_name = os.path.splitext(os.path.basename(sc_ctx.main_node.span.file_name))[0] return os.path.abspath('{}/lib{}_{}_{}_plugin_{}'.format(LIB_PATH, @@ -317,7 +318,10 @@ def toolchain_build(sc_ctx: context.ScriptContext, toolchain: ToolChain): sc_ctx.dso_path = (sc_ctx.dso_path[0], so_path) -def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False, compile_options=None, make_path_prefix=None): +def build_dso(sc_ctx: context.ScriptContext, + use_toolchain=False, + compile_options=None, + make_path_prefix=None): rt_mod = sc_ctx.rt_module main_node_name = sc_ctx.main_node.context.name if make_path_prefix is None: From 54bde844ef092817cac321bceb5061cccdbd1017 Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 6 Feb 2023 12:31:51 +0800 Subject: [PATCH 13/21] fix py codestyle --- python/matx/toolchain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py index e01b2299..422a0796 100644 --- a/python/matx/toolchain.py +++ b/python/matx/toolchain.py @@ -263,7 +263,7 @@ def path_prefix_inductor(sc_ctx: context.ScriptContext): example_inputs = sc_ctx.main_node.context.example_inputs_spec example_inputs_str = ''.join([str(inputs) for inputs in example_inputs]) cache_str = sc_ctx.main_node.span.source_code + \ - dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__ + dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__ cache_md5 = hashlib.md5(cache_str.encode()).hexdigest()[:16] file_name = os.path.splitext(os.path.basename(sc_ctx.main_node.span.file_name))[0] return os.path.abspath('{}/lib{}_{}_{}_plugin_{}'.format(LIB_PATH, From 54bc1c004964abd6da9beec7ca9b387df05e5c8c Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 6 Feb 2023 13:11:09 +0800 Subject: [PATCH 14/21] add inductor to __all__ in matx.__init__ --- python/matx/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/matx/__init__.py b/python/matx/__init__.py index 697c6c19..63dddd4a 100644 --- a/python/matx/__init__.py +++ b/python/matx/__init__.py @@ -40,6 +40,7 @@ "trace", "script", "script_embedded_class", + "inductor", "save", "load", "get_cflags", From d8141131c3e7eb48fdf32442a57feb6adcbe98fc Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 6 Feb 2023 13:13:36 +0800 Subject: [PATCH 15/21] remove TODO in python/matx/pipeline/_register_conveter.py --- python/matx/pipeline/_register_conveter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/matx/pipeline/_register_conveter.py b/python/matx/pipeline/_register_conveter.py index c28e7246..7fca3493 100644 --- a/python/matx/pipeline/_register_conveter.py +++ b/python/matx/pipeline/_register_conveter.py @@ -19,7 +19,6 @@ try: - # TODO: consider lazy import this after users called matx.inductor_script import torch import torch.utils.dlpack From a4c590ca80fec4e26407b4c86cb0a3a0091e5368 Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 6 Feb 2023 13:20:57 +0800 Subject: [PATCH 16/21] add LICENSE --- python/matx/script/inductor/tensor_spec.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/python/matx/script/inductor/tensor_spec.py b/python/matx/script/inductor/tensor_spec.py index bdee4a73..40a229fc 100644 --- a/python/matx/script/inductor/tensor_spec.py +++ b/python/matx/script/inductor/tensor_spec.py @@ -1,3 +1,22 @@ +# Copyright 2022 ByteDance Ltd. and/or its affiliates. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + def convert_torch_dtype(dtype): import torch table = { From 47cb1097d127c962c43212da798af865ae48d831 Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 6 Feb 2023 15:46:16 +0800 Subject: [PATCH 17/21] add inductor ci --- .github/workflows/test_py_inductor.yml | 26 ++++++++++++++ ci/run_py_inductor_test.sh | 50 ++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 .github/workflows/test_py_inductor.yml create mode 100644 ci/run_py_inductor_test.sh diff --git a/.github/workflows/test_py_inductor.yml b/.github/workflows/test_py_inductor.yml new file mode 100644 index 00000000..b671d168 --- /dev/null +++ b/.github/workflows/test_py_inductor.yml @@ -0,0 +1,26 @@ +name: Test Inductor + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.8' + - name: Prepare PyTorch 2.0 nightly + run: pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu + - name: Echo GCC version + run: gcc --version + - name: Install MATXScript Requirements + run: pip3 install -r python/requirements.txt + - name: PyTorch Extension Test + run: bash ci/run_py_inductor_test.sh diff --git a/ci/run_py_inductor_test.sh b/ci/run_py_inductor_test.sh new file mode 100644 index 00000000..14a68c0f --- /dev/null +++ b/ci/run_py_inductor_test.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Copyright 2022 ByteDance Ltd. and/or its affiliates. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -xue +set -o pipefail + +THIS_PATH=$(cd $(dirname "$0"); pwd) +ROOT_PATH=${THIS_PATH}/../ + +############################################################################### +# build all shared target +############################################################################### +cd "${ROOT_PATH}" || exit 1 +BUILD_TESTING=OFF BUILD_BENCHMARK=OFF CPPFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" bash ci/build_lib.sh + +############################################################################### +# install requirements +############################################################################### +PYTHON_MODULE_PATH=${ROOT_PATH}/python +cd "${PYTHON_MODULE_PATH}" +pip3 install -r requirements.txt + +############################################################################### +# find all test script +############################################################################### +PYTHONPATH=${PYTHONPATH:-} +TEST_SCRIPT_PATH=${ROOT_PATH}/test/inductor +cd "${TEST_SCRIPT_PATH}" +# shellcheck disable=SC2045 +for script_file in $(ls test_*.py); do + echo "test script: ${script_file}" + PYTHONPATH="${ROOT_PATH}/python:${PYTHONPATH}" python3 "${script_file}" +done From 43de4248ad20690018695884350c6a5d3bd7d590 Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 6 Feb 2023 16:08:07 +0800 Subject: [PATCH 18/21] fix Pytorch version check --- .github/workflows/test_py_inductor.yml | 4 ++-- python/matx/torch_compiler/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_py_inductor.yml b/.github/workflows/test_py_inductor.yml index b671d168..72d8436e 100644 --- a/.github/workflows/test_py_inductor.yml +++ b/.github/workflows/test_py_inductor.yml @@ -1,4 +1,4 @@ -name: Test Inductor +name: Test Python Inductor on: push: @@ -22,5 +22,5 @@ jobs: run: gcc --version - name: Install MATXScript Requirements run: pip3 install -r python/requirements.txt - - name: PyTorch Extension Test + - name: Python Inductor Test run: bash ci/run_py_inductor_test.sh diff --git a/python/matx/torch_compiler/__init__.py b/python/matx/torch_compiler/__init__.py index 2fe03049..97fcd247 100644 --- a/python/matx/torch_compiler/__init__.py +++ b/python/matx/torch_compiler/__init__.py @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -minimum_torch_version = '2.0.0a0' +minimum_torch_version = '2.0.0.dev' try: import torch From 032da39e5b5ff6010f4715e8c800c2dc7e199121 Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 6 Feb 2023 16:47:18 +0800 Subject: [PATCH 19/21] update torch_compiler to match nightly 20230205 --- .github/workflows/test_py_inductor.yml | 2 +- python/matx/toolchain.py | 7 ++-- .../codegen/inductor/__init__.py | 32 ++++++++++++------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test_py_inductor.yml b/.github/workflows/test_py_inductor.yml index 72d8436e..adc4c20b 100644 --- a/.github/workflows/test_py_inductor.yml +++ b/.github/workflows/test_py_inductor.yml @@ -17,7 +17,7 @@ jobs: with: python-version: '3.8' - name: Prepare PyTorch 2.0 nightly - run: pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu + run: pip3 install --pre torch==2.0.0.dev20230205 --index-url https://download.pytorch.org/whl/nightly/cpu - name: Echo GCC version run: gcc --version - name: Install MATXScript Requirements diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py index 422a0796..7c8724fc 100644 --- a/python/matx/toolchain.py +++ b/python/matx/toolchain.py @@ -262,8 +262,8 @@ def path_prefix_inductor(sc_ctx: context.ScriptContext): assert isinstance(sc_ctx.main_node.context, context.InductorContext) example_inputs = sc_ctx.main_node.context.example_inputs_spec example_inputs_str = ''.join([str(inputs) for inputs in example_inputs]) - cache_str = sc_ctx.main_node.span.source_code + \ - dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__ + cache_str = sc_ctx.main_node.span.source_code + dep_source_codes + cache_str += example_inputs_str + _LIB_SHA1 + __version__ cache_md5 = hashlib.md5(cache_str.encode()).hexdigest()[:16] file_name = os.path.splitext(os.path.basename(sc_ctx.main_node.span.file_name))[0] return os.path.abspath('{}/lib{}_{}_{}_plugin_{}'.format(LIB_PATH, @@ -433,7 +433,8 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl torch_compiler_options = ipaths.split() + lpaths.split() + libs.split() + macros.split() # TODO: fix this on macOS m1. - torch_compiler_options.remove('-lgomp') + if '-lomp' in torch_compiler_options: + torch_compiler_options.remove('-lomp') build_dso(result, toolchain is not None, compile_options=torch_compiler_options, make_path_prefix=path_prefix_inductor) diff --git a/python/matx/torch_compiler/codegen/inductor/__init__.py b/python/matx/torch_compiler/codegen/inductor/__init__.py index a85e6c16..cc06f675 100644 --- a/python/matx/torch_compiler/codegen/inductor/__init__.py +++ b/python/matx/torch_compiler/codegen/inductor/__init__.py @@ -57,20 +57,28 @@ def compile_fx_inner_cpu( # lift the maximum depth of the Python interpreter stack # to adapt large/deep models compile_fx.sys.setrecursionlimit(max(compile_fx.sys.getrecursionlimit(), 2000)) + V.debug.fx_graph(gm, example_inputs) + shape_env = compile_fx._shape_env_from_inputs(example_inputs) - fake_mode = compile_fx.fake_mode_from_tensors(example_inputs) - graph = compile_fx.GraphLowering( - gm, - shape_env=shape_env, - num_static_inputs=num_fixed, - graph_id=graph_id, - fake_mode=fake_mode, - ) - with V.set_graph_handler(graph): - graph.run(*example_inputs) - code = graph.codegen() - fake_callable.set_code(code) + fake_mode = compile_fx.fake_mode_from_tensors( + example_inputs + ) or torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True) + + with V.set_fake_mode(fake_mode): + compile_fx.pattern_matcher.fx_passes(gm) + V.debug.fx_graph_transformed(gm, example_inputs) + + graph = compile_fx.GraphLowering( + gm, + shape_env=shape_env, + num_static_inputs=num_fixed, + graph_id=graph_id, + ) + with V.set_graph_handler(graph): + graph.run(*example_inputs) + code = graph.codegen() + fake_callable.set_code(code) return fake_callable From f75d9501cc267c73dfc803cf57d0c9bfdd1fd10e Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 6 Feb 2023 23:01:37 +0800 Subject: [PATCH 20/21] fix linux compilation bug --- python/matx/toolchain.py | 14 +++++++------- .../matx/torch_compiler/codegen/matx_formatter.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py index 7c8724fc..42c0885e 100644 --- a/python/matx/toolchain.py +++ b/python/matx/toolchain.py @@ -421,20 +421,20 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl result: context.ScriptContext = from_source(compiling_obj, example_inputs) from torch._inductor import codecache - ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths(include_pytorch=False) + ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths(include_pytorch=False, + vec_isa=codecache.pick_vec_isa()) # TODO: check whether the following flags are handled by common flags # codecache.get_shared() - # codecache.optimization_flags() + optimization_flag = codecache.optimization_flags() # codecache.cpp_flags() # codecache.get_warning_all_flag() # codecache.use_custom_generated_macros() - torch_compiler_options = ipaths.split() + lpaths.split() + libs.split() + macros.split() - - # TODO: fix this on macOS m1. - if '-lomp' in torch_compiler_options: - torch_compiler_options.remove('-lomp') + torch_compiler_options = [] + flag_str_lst = [ipaths, lpaths, libs, macros, optimization_flag] + for flag_str in flag_str_lst: + torch_compiler_options.extend(flag_str.split()) build_dso(result, toolchain is not None, compile_options=torch_compiler_options, make_path_prefix=path_prefix_inductor) diff --git a/python/matx/torch_compiler/codegen/matx_formatter.py b/python/matx/torch_compiler/codegen/matx_formatter.py index f583c705..c30fb2ea 100644 --- a/python/matx/torch_compiler/codegen/matx_formatter.py +++ b/python/matx/torch_compiler/codegen/matx_formatter.py @@ -283,7 +283,7 @@ def matx_cpp_code_format(code: str, kernel_name: str, include_code_str, kernel_code_str = split_include_kernel(code) # add matx include - include_code_str = MATX_INCLUDE + include_code_str += MATX_INCLUDE # extract kernel declaration kernel_declaration_str, kernel_body_str = split_declaration_body(kernel_code_str) From e5115e7ff0bbf053e08ddd1d16db0df3c6c10998 Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Mon, 6 Feb 2023 23:05:52 +0800 Subject: [PATCH 21/21] fix py codestyle --- python/matx/toolchain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py index 42c0885e..b4185c06 100644 --- a/python/matx/toolchain.py +++ b/python/matx/toolchain.py @@ -421,8 +421,8 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl result: context.ScriptContext = from_source(compiling_obj, example_inputs) from torch._inductor import codecache - ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths(include_pytorch=False, - vec_isa=codecache.pick_vec_isa()) + ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths( + include_pytorch=False, vec_isa=codecache.pick_vec_isa()) # TODO: check whether the following flags are handled by common flags # codecache.get_shared()