From b016f4e191f58c078a79a486ddee9efaef1cfd14 Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Mon, 16 Jan 2023 19:09:26 +0800
Subject: [PATCH 01/21] start matx.inductor

---
 python/matx/script/context/inductor_context.py | 18 ++++++++++++++++++
 python/matx/toolchain.py                       |  4 ++++
 2 files changed, 22 insertions(+)
 create mode 100644 python/matx/script/context/inductor_context.py

diff --git a/python/matx/script/context/inductor_context.py b/python/matx/script/context/inductor_context.py
new file mode 100644
index 00000000..84bf20b4
--- /dev/null
+++ b/python/matx/script/context/inductor_context.py
@@ -0,0 +1,18 @@
+# Copyright 2022 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
\ No newline at end of file
diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py
index 8758f557..556ba4fe 100644
--- a/python/matx/toolchain.py
+++ b/python/matx/toolchain.py
@@ -380,6 +380,10 @@ def script(compiling_obj, *, share=True, toolchain=None, bundle_args=None):
         raise ValueError('Unsupported build_type: {}'.format(result.build_type))
 
 
+def inductor(compiling_obj, *, share=True, toolchain=None, bundle_args=None):
+    pass
+
+
 def make_session(compiling_obj, method='__call__'):
     from . import pipeline
 

From bb78f4514a90b0d20d90b6e9df6f7f66145abfec Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Tue, 17 Jan 2023 21:35:30 +0800
Subject: [PATCH 02/21] almost finish inductor mvp

---
 python/matx/__init__.py                       |  5 ++-
 python/matx/contrib/cc.py                     | 11 ++++-
 python/matx/inductor/__init__.py              | 45 +++++++++++++++++++
 python/matx/inductor/context/__init__.py      |  0
 python/matx/runtime/module.py                 |  2 +
 python/matx/script/context/__init__.py        |  1 +
 python/matx/script/context/ast_node.py        |  3 +-
 .../matx/script/context/inductor_context.py   | 12 ++++-
 python/matx/toolchain.py                      | 43 ++++++++++++++++--
 9 files changed, 114 insertions(+), 8 deletions(-)
 create mode 100644 python/matx/inductor/__init__.py
 create mode 100644 python/matx/inductor/context/__init__.py

diff --git a/python/matx/__init__.py b/python/matx/__init__.py
index 4870a959..5c84b691 100644
--- a/python/matx/__init__.py
+++ b/python/matx/__init__.py
@@ -30,7 +30,6 @@
 from . import vision
 from . import tools
 
-
 # APIs
 __all__ = [
     # functions
@@ -352,6 +351,10 @@ def script(compiling_obj, *args, backend=None, **kwargs):
         return toolchain.script(compiling_obj, *args, **kwargs)
 
 
+def inductor(compiling_obj, example_inputs, **kwargs):
+    return toolchain.inductor(compiling_obj, example_inputs, **kwargs)
+
+
 def script_embedded_class(code, is_path=False):
     return toolchain.script_embedded_class(code, is_path)
 
diff --git a/python/matx/contrib/cc.py b/python/matx/contrib/cc.py
index bd45d695..4bed4eeb 100644
--- a/python/matx/contrib/cc.py
+++ b/python/matx/contrib/cc.py
@@ -93,9 +93,16 @@ def find_sys_cc_path():
         raise RuntimeError("win32 is not supported")
     elif sys.platform.startswith('darwin'):
         # maybe we can use clang++
-        cc_bin = "g++"
+        # prioritized compiler defined in CXX
+        if 'CXX' in os.environ:
+            cc_bin = os.environ['CXX']
+        else:
+            cc_bin = "g++"
     else:
-        cc_bin = "g++"
+        if 'CXX' in os.environ:
+            cc_bin = os.environ['CXX']
+        else:
+            cc_bin = "g++"
     return cc_bin
 
 
diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py
new file mode 100644
index 00000000..751e5d52
--- /dev/null
+++ b/python/matx/inductor/__init__.py
@@ -0,0 +1,45 @@
+import inspect
+from typing import List
+
+import torch
+from torch_compiler.manual_codegen import extract_inductor_code, matx_cpp_code_format
+
+from matx.env import MATX_DEV_MODE
+from matx.script import context
+from matx.toolchain import path_prefix
+
+
+def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> context.ScriptContext:
+    try:
+
+        code = extract_inductor_code(compiling_obj, example_inputs)
+        code = matx_cpp_code_format(code)
+
+        sc_ctx = context.ScriptContext()
+        sc_ctx.build_type = context.BuildType.FUNCTION
+        sc_ctx.main_node.raw = compiling_obj
+        # set sc_ctx attributes to be compatible with existing matx code
+        inductor_context = context.InductorContext(fn_name=compiling_obj.__name__)
+        sc_ctx.main_node.context = inductor_context
+        # set source code TODO: formatting source code
+        sc_ctx.main_node.span.source_code = inspect.getsource(compiling_obj)
+        # set filename. TODO: this is too hack
+        frame = inspect.stack()[3]
+        sc_ctx.main_node.span.file_name = frame[0].f_code.co_filename
+
+        # export code
+        path = path_prefix(sc_ctx)
+        with open(path, 'w') as f:
+            f.write(code)
+
+        # set rt_module
+        from .. import _ffi
+        build_module = _ffi.get_global_func("embedded.build.c")
+        sc_ctx.rt_module = build_module(code.encode())
+
+        return sc_ctx
+    except BaseException as e:
+        if MATX_DEV_MODE:
+            raise
+        else:
+            raise Exception(str(e)) from None
diff --git a/python/matx/inductor/context/__init__.py b/python/matx/inductor/context/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/matx/runtime/module.py b/python/matx/runtime/module.py
index e57caa28..11aa736b 100644
--- a/python/matx/runtime/module.py
+++ b/python/matx/runtime/module.py
@@ -210,6 +210,8 @@ def export_library(self, file_name, fcompile=None, addons=None, **kwargs):
 
         assert self.type_key == "c"
 
+        breakpoint()
+
         modules = self._collect_dso_modules()
         files = addons if addons else []
         is_system_lib = False
diff --git a/python/matx/script/context/__init__.py b/python/matx/script/context/__init__.py
index 342af971..896630ad 100644
--- a/python/matx/script/context/__init__.py
+++ b/python/matx/script/context/__init__.py
@@ -23,3 +23,4 @@
 from .class_context import ClassContext, GetClassAttr
 from .function_context import FunctionContext, FunctionType
 from .scope_context import ScopeContext
+from .inductor_context import InductorContext
diff --git a/python/matx/script/context/ast_node.py b/python/matx/script/context/ast_node.py
index 70ee5c57..55b2c418 100644
--- a/python/matx/script/context/ast_node.py
+++ b/python/matx/script/context/ast_node.py
@@ -22,6 +22,7 @@
 from matx._typed_ast import ast
 from .class_context import ClassContext
 from .function_context import FunctionContext
+from .inductor_context import InductorContext
 from ... import ir as _ir
 
 
@@ -49,7 +50,7 @@ def __init__(self, ):
         self.raw: Optional[type] = None
         self.span: Span = Span()
         self.ast: Optional[ast.AST] = None
-        self.context: Union[ClassContext, FunctionContext, None] = None
+        self.context: Union[ClassContext, FunctionContext, InductorContext, None] = None
         self.module: Optional[ModuleInfo] = None
         self.deps: Optional[List[ASTNode]] = None
         self.ir_schema = None
diff --git a/python/matx/script/context/inductor_context.py b/python/matx/script/context/inductor_context.py
index 84bf20b4..dbdb434e 100644
--- a/python/matx/script/context/inductor_context.py
+++ b/python/matx/script/context/inductor_context.py
@@ -15,4 +15,14 @@
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
-# under the License.
\ No newline at end of file
+# under the License.
+
+
+class InductorContext(object):
+    def __init__(self,
+                 fn_name: str = '<unknown>', ):
+        self.fn_name = fn_name
+
+    @property
+    def name(self):
+        return self.fn_name
diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py
index 556ba4fe..bd946627 100644
--- a/python/matx/toolchain.py
+++ b/python/matx/toolchain.py
@@ -40,6 +40,7 @@
 USE_SO_CACHE = os.environ.get('MATX_USE_SO_CACHE', '').lower() != 'false'
 
 DISABLE_SCRIPT = os.environ.get('MATX_DISABLE_SCRIPT', '').lower() == 'true'
+DISABLE_INDUCTOR = os.environ.get('MATX_DISABLE_INDUCTOR', '').lower() == 'true'
 DISABLE_GENERATE_CC = os.environ.get('MATX_DISABLE_GENERATE_CC', '').lower() == 'true'
 FLAG_COMPILED_OBJECT = object()
 
@@ -296,7 +297,7 @@ def toolchain_build(sc_ctx: context.ScriptContext, toolchain: ToolChain):
         sc_ctx.dso_path = (sc_ctx.dso_path[0], so_path)
 
 
-def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False):
+def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False, compile_options=None):
     rt_mod = sc_ctx.rt_module
     main_node_name = sc_ctx.main_node.context.name
     base_path = path_prefix(sc_ctx)
@@ -305,12 +306,16 @@ def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False):
         sopath = base_path + '.so'
         sopath_cxx11 = base_path + '_cxx11.so'
 
+        # TODO: need to unify the compile options
         base_options = [
             "-std=c++14",
             "-O3",
             "-g",
             "-fdiagnostics-color=always",
             "-Werror=return-type"]
+        if compile_options is not None:
+            assert isinstance(compile_options, List)
+            base_options.extend(compile_options)
         cxx11_with_abi_options = base_options + ["-D_GLIBCXX_USE_CXX11_ABI=1"]
         cxx11_no_abi_options = base_options + ["-D_GLIBCXX_USE_CXX11_ABI=0"]
         sys_cc_path = contrib.cc.find_sys_cc_path()
@@ -380,8 +385,40 @@ def script(compiling_obj, *, share=True, toolchain=None, bundle_args=None):
         raise ValueError('Unsupported build_type: {}'.format(result.build_type))
 
 
-def inductor(compiling_obj, *, share=True, toolchain=None, bundle_args=None):
-    pass
+def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundle_args=None):
+    """Compile ``compiling_obj`` through Inductor codegen and return the built matx op.
+
+    ``compiling_obj`` is returned untouched when MATX_DISABLE_SCRIPT or
+    MATX_DISABLE_INDUCTOR is 'true'. Raises ValueError for an unknown build_type.
+    """
+    if DISABLE_SCRIPT or DISABLE_INDUCTOR:
+        return compiling_obj
+
+    import sysconfig
+    from torch.utils.cpp_extension import include_paths
+    from matx.inductor import from_source
+
+    result: context.ScriptContext = from_source(compiling_obj, example_inputs)
+
+    # Header locations come from the installed torch/Python, not a fixed local path.
+    # TODO: the remaining flags are still hardcoded for the mvp.
+    torch_compiler_options = ['-I' + path for path in include_paths()] + [
+        '-I' + sysconfig.get_path('include'),
+        '-lgomp',
+        '-march=native', '-ffast-math', '-fno-finite-math-only',
+        '-fopenmp',
+        '-DC10_USING_CUSTOM_GENERATED_MACROS',
+    ]
+
+    build_dso(result, toolchain is not None, compile_options=torch_compiler_options)
+    if toolchain is not None:
+        toolchain_build(result, toolchain)
+
+    if result.build_type is context.BuildType.FUNCTION:
+        return make_jit_op_creator(result, share, bundle_args=bundle_args)()
+    if result.build_type is context.BuildType.JIT_OBJECT:
+        return make_jit_object_creator(result, share, bundle_args=bundle_args)
+    raise ValueError('Unsupported build_type: {}'.format(result.build_type))
 
 
 def make_session(compiling_obj, method='__call__'):

From 9bf0c3198d3a51668f1c55b3975e92d944887824 Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Wed, 18 Jan 2023 11:59:35 +0800
Subject: [PATCH 03/21] matx.inductor mvp example works

---
 python/matx/inductor/__init__.py               | 7 +++++++
 python/matx/runtime/module.py                  | 2 --
 python/matx/script/context/inductor_context.py | 3 +++
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py
index 751e5d52..ac50930a 100644
--- a/python/matx/inductor/__init__.py
+++ b/python/matx/inductor/__init__.py
@@ -37,6 +37,13 @@ def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> cont
         build_module = _ffi.get_global_func("embedded.build.c")
         sc_ctx.rt_module = build_module(code.encode())
 
+        # set args types. # TODO: hardcode for now
+        from .. import ir
+        sc_ctx.main_node.context.arg_types = dict(
+            a=ir.type.NDArrayType(),
+            b=ir.type.NDArrayType()
+        )
+
         return sc_ctx
     except BaseException as e:
         if MATX_DEV_MODE:
diff --git a/python/matx/runtime/module.py b/python/matx/runtime/module.py
index 11aa736b..e57caa28 100644
--- a/python/matx/runtime/module.py
+++ b/python/matx/runtime/module.py
@@ -210,8 +210,6 @@ def export_library(self, file_name, fcompile=None, addons=None, **kwargs):
 
         assert self.type_key == "c"
 
-        breakpoint()
-
         modules = self._collect_dso_modules()
         files = addons if addons else []
         is_system_lib = False
diff --git a/python/matx/script/context/inductor_context.py b/python/matx/script/context/inductor_context.py
index dbdb434e..ec775a1f 100644
--- a/python/matx/script/context/inductor_context.py
+++ b/python/matx/script/context/inductor_context.py
@@ -22,6 +22,9 @@ class InductorContext(object):
     def __init__(self,
                  fn_name: str = '<unknown>', ):
         self.fn_name = fn_name
+        self.unbound_name = fn_name
+        self.return_type = None
+        self.arg_types = {}  # Deferred?
 
     @property
     def name(self):

From 3439aea6c2975b1c67f9bf06b9b3bdfd5317b0af Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Wed, 18 Jan 2023 12:08:31 +0800
Subject: [PATCH 04/21] add inductor demo

---
 examples/inductor/simple_inductor.py | 66 ++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 examples/inductor/simple_inductor.py

diff --git a/examples/inductor/simple_inductor.py b/examples/inductor/simple_inductor.py
new file mode 100644
index 00000000..a4ef44ff
--- /dev/null
+++ b/examples/inductor/simple_inductor.py
@@ -0,0 +1,66 @@
+import matx
+import torch
+import numpy as np
+import json
+
+
+def kernel(a: matx.NDArray, b: matx.NDArray):
+    c = a + b
+    c = torch.nn.functional.relu(c)
+    return c,
+
+
+add_kernel = matx.inductor(kernel, example_inputs=[
+    torch.randn(5),
+    torch.randn(5)
+])
+
+
+@matx.script
+def add_json(a: str, b: str) -> str:
+    """
+    Assume a and b is a json containing 10 digits. We would like to add them and return another json
+    """
+    a_list = json.loads(a)
+    b_list = json.loads(b)
+
+    a_tensor = matx.NDArray(arr=a_list, shape=[5], dtype='float32')
+    b_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32')
+    c_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32')
+
+    add_kernel(a_tensor, b_tensor, c_tensor)
+
+    result_lst = c_tensor.tolist()
+
+    return json.dumps(result_lst)
+
+
+if __name__ == '__main__':
+    print(f'Pytorch version {torch.__version__}')
+
+    a_np = np.random.randn(5).astype(np.float32)
+    b_np = np.random.randn(5).astype(np.float32)
+    c_np = np.random.randn(5).astype(np.float32)
+
+    a = matx.NDArray([], a_np.shape, str(a_np.dtype))
+    a.from_numpy(a_np)
+
+    b = matx.NDArray([], b_np.shape, str(b_np.dtype))
+    b.from_numpy(b_np)
+
+    c = matx.NDArray([], c_np.shape, str(c_np.dtype))
+    c.from_numpy(c_np)
+
+    print(a)
+    print(b)
+    print(c)
+
+    print(kernel(a.torch(), b.torch()))
+
+    d = add_kernel(a, b, c)
+    print(c)
+
+    a = json.dumps([1, 2, 3, 4, 5])
+    b = json.dumps([6, 7, 8, 9, 10])
+    result = add_json(a, b)
+    print(result)

From 0dc8280348d258e08cca0acabf307d72c0684c0c Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Tue, 31 Jan 2023 15:22:44 +0800
Subject: [PATCH 05/21] move torch_compiler inside matx. Currently, depends on
 Pytorch 2.0

---
 python/matx/__init__.py                       |  16 +-
 python/matx/inductor/__init__.py              |   2 +-
 python/matx/torch_compiler/__init__.py        |  13 +
 python/matx/torch_compiler/codegen.py         | 261 ++++++++++++++++++
 .../torch_compiler/tests}/simple_inductor.py  |  36 +--
 python/matx/torch_compiler/utils/__init__.py  |   0
 python/matx/torch_compiler/utils/cpp_parse.py | 141 ++++++++++
 7 files changed, 434 insertions(+), 35 deletions(-)
 create mode 100644 python/matx/torch_compiler/__init__.py
 create mode 100644 python/matx/torch_compiler/codegen.py
 rename {examples/inductor => python/matx/torch_compiler/tests}/simple_inductor.py (56%)
 create mode 100644 python/matx/torch_compiler/utils/__init__.py
 create mode 100644 python/matx/torch_compiler/utils/cpp_parse.py

diff --git a/python/matx/__init__.py b/python/matx/__init__.py
index 5c84b691..78cb0d35 100644
--- a/python/matx/__init__.py
+++ b/python/matx/__init__.py
@@ -351,8 +351,20 @@ def script(compiling_obj, *args, backend=None, **kwargs):
         return toolchain.script(compiling_obj, *args, **kwargs)
 
 
-def inductor(compiling_obj, example_inputs, **kwargs):
-    return toolchain.inductor(compiling_obj, example_inputs, **kwargs)
+def inductor(example_inputs, **kwargs):
+    """
+
+    Args:
+        example_inputs: any nested structure of torch.Tensor that passed into the kernel
+        **kwargs: other keyword arguments passed into toolchain.inductor
+
+    Returns: a wrapper that compiles the compiling_obj into a JIT FUNCTION
+
+    """
+    def inner_inductor(compiling_obj):
+        return toolchain.inductor(compiling_obj, example_inputs, **kwargs)
+
+    return inner_inductor
 
 
 def script_embedded_class(code, is_path=False):
diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py
index ac50930a..b3a61d84 100644
--- a/python/matx/inductor/__init__.py
+++ b/python/matx/inductor/__init__.py
@@ -2,7 +2,7 @@
 from typing import List
 
 import torch
-from torch_compiler.manual_codegen import extract_inductor_code, matx_cpp_code_format
+from matx.torch_compiler.codegen import extract_inductor_code, matx_cpp_code_format
 
 from matx.env import MATX_DEV_MODE
 from matx.script import context
diff --git a/python/matx/torch_compiler/__init__.py b/python/matx/torch_compiler/__init__.py
new file mode 100644
index 00000000..e83ac962
--- /dev/null
+++ b/python/matx/torch_compiler/__init__.py
@@ -0,0 +1,13 @@
+minimum_torch_version = '2.0.0a0'
+
+try:
+    import torch
+
+    assert torch.__version__ >= minimum_torch_version
+
+except ModuleNotFoundError:
+    print(f'torch is not installed. matx.inductor requires torch >= {minimum_torch_version}')
+    raise
+except AssertionError:
+    print(f'matx.inductor requires torch >= {minimum_torch_version}')
+    raise
diff --git a/python/matx/torch_compiler/codegen.py b/python/matx/torch_compiler/codegen.py
new file mode 100644
index 00000000..4d2566c9
--- /dev/null
+++ b/python/matx/torch_compiler/codegen.py
@@ -0,0 +1,261 @@
+import copy
+import logging
+from typing import List
+
+import torch
+import torch._inductor.compile_fx as compile_fx
+from torch import fx
+from torch._inductor.debug import DebugContext
+from torch._inductor.virtualized import V
+
+from .utils import cpp_parse
+
+log = logging.getLogger(__name__)
+
+MATX_INCLUDE = '''
+#include "matxscript/runtime/codegen_all_includes.h"
+#include <math.h>
+
+using namespace ::matxscript::runtime;
+extern "C" void* __matxscript_module_ctx = NULL;
+
+extern "C" MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__;
+'''
+
+SESSION_HANLDER = cpp_parse.CPPArg(name='handle_2_71828182846',
+                                   type=cpp_parse.CPPType(name='void', is_pointer=True))
+SESSION_HANLDER_WITH_DEAFULT = cpp_parse.CPPArg(name='handle_2_71828182846',
+                                                type=cpp_parse.CPPType(name='void', is_pointer=True),
+                                                default_val='((void*)(int64_t)0)')
+
+
+def generate_ndarray_arg_cast(arg_name, arg_index, dtype, message='TODO'):
+    return f'({dtype}*)internal::TypeAsHelper<NDArray>::run(({arg_name}[{arg_index}]), __FILE__, __LINE__, "{message}", "{message}").Data<{dtype}>()'
+
+
+def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) -> str:
+    """Render the C++ ``<kernel_name>__c_api`` wrapper that unpacks packed matx args.
+
+    The wrapper carries ``kernel_name`` so it matches the symbol that
+    ``get_c_api_declare`` declares and ``get_registration_str`` registers. Each of
+    ``args`` is cast to an NDArray data pointer; void kernels store a dummy ``1``.
+    """
+    template_with_return = '''
+int {kernel_name}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr)
+{{
+  TArgs args_t(args, num_args);
+
+  if (num_args > 0 && args[num_args - 1].code == TypeIndex::kRuntimeKwargs) {{
+    string_view arg_names[{}] {{{}}};
+    KwargsUnpackHelper helper("{}", arg_names, {}, nullptr, 0);
+    RTView pos_args[{}];
+    helper.unpack(pos_args, args, num_args);
+
+    auto ret = {}({},
+                {}resource_handle);
+    RTValue(std::move(ret)).MoveToCHost(out_ret_value);
+  }} else {{
+    switch(num_args) {{
+      case {}: {{
+        auto ret = {}({},
+                    {}resource_handle);
+        RTValue(std::move(ret)).MoveToCHost(out_ret_value);
+      }} break;
+      default: {{THROW_PY_TypeError("TODO");}} break;
+    }}
+  }}
+
+  return 0;
+}}
+'''
+    template_without_return = '''
+int {kernel_name}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr)
+{{
+  TArgs args_t(args, num_args);
+
+  if (num_args > 0 && args[num_args - 1].code == TypeIndex::kRuntimeKwargs) {{
+    string_view arg_names[{}] {{{}}};
+    KwargsUnpackHelper helper("{}", arg_names, {}, nullptr, 0);
+    RTView pos_args[{}];
+    helper.unpack(pos_args, args, num_args);
+
+    {}({},
+     {}resource_handle);
+    int ret = 1;
+    RTValue(std::move(ret)).MoveToCHost(out_ret_value);
+  }} else {{
+    switch(num_args) {{
+      case {}: {{
+        {}({},
+         {}resource_handle);
+        int ret = 1;
+        RTValue(std::move(ret)).MoveToCHost(out_ret_value);
+      }} break;
+      default: {{THROW_PY_TypeError("TODO");}} break;
+    }}
+  }}
+
+  return 0;
+}}
+'''
+    template = template_with_return if has_return_value else template_without_return
+
+    num_args = len(args)
+    arg_names_concat_str = ', '.join([f'"{arg.name}"' for arg in args])
+    pos_arg_cast_lst = [generate_ndarray_arg_cast('pos_args', i, arg.type.name)
+                        for i, arg in enumerate(args)]
+    args_t_cast_lst = [generate_ndarray_arg_cast('args_t', i, arg.type.name)
+                       for i, arg in enumerate(args)]
+
+    # Continuation lines are padded so the casts line up under the first argument.
+    kernel_name_indentation = len(kernel_name) * ' '
+    return_name_indentation = ' ' * 11 if has_return_value else ''
+    pos_arg_cast_indentation = '\n     ' + kernel_name_indentation + return_name_indentation
+    args_t_cast_indentation = '\n         ' + kernel_name_indentation + return_name_indentation
+    pos_arg_cast = (',' + pos_arg_cast_indentation).join(pos_arg_cast_lst)
+    args_t_cast = (',' + args_t_cast_indentation).join(args_t_cast_lst)
+
+    return template.format(num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name,
+                           pos_arg_cast, kernel_name_indentation, num_args, kernel_name,
+                           args_t_cast, kernel_name_indentation, kernel_name=kernel_name)
+
+
+def get_registration_str(kernel_name):
+    # TODO: currently, only 1 function is here.
+    template = '''
+extern "C" {{
+
+MATX_DLL MATXScriptBackendPackedCFunc __matxscript_func_array__[] = {{
+    (MATXScriptBackendPackedCFunc){}__c_api,
+}};
+MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__ = {{
+    "1\\000{}\\000",    __matxscript_func_array__,
+}};
+
+}} // extern C
+
+extern "C" {{
+
+MATX_DLL const char* __matxscript_closures_names__ = "1\\000{}\\000";
+
+}} // extern C
+
+    '''
+    return template.format(kernel_name, kernel_name, kernel_name)
+
+
+def get_c_api_declare(kernel_name):
+    return f'int {kernel_name}__c_api(MATXScriptAny*, int, MATXScriptAny*, void*);'
+
+
+def extract_cpp_code(code: str):
+    return code.split("'''")[1][1:-1]
+
+
+def matx_cpp_code_format(code: str) -> str:
+    code = extract_cpp_code(code)
+    # split include and kernel code
+    first_newline_idx = code.find('\n')
+    include_code_str = code[:first_newline_idx]
+    kernel_code_str = code[first_newline_idx + 1:]
+
+    # add matx include
+    include_code_str += MATX_INCLUDE
+
+    # extract kernel declaration
+    first_open_bracket = kernel_code_str.find('{')
+    kernel_declaration_str = kernel_code_str[:first_open_bracket]
+    kernel_body_str = kernel_code_str[first_open_bracket:]
+
+    kernel_declaration = cpp_parse.parse_cpp_declaration(kernel_declaration_str)
+
+    kernel_declaration_without_default = copy.deepcopy(kernel_declaration)
+    kernel_declaration_without_default.append_arg(SESSION_HANLDER)
+    kernel_declaration_with_default = copy.deepcopy(kernel_declaration)
+    kernel_declaration_with_default.append_arg(SESSION_HANLDER_WITH_DEAFULT)
+
+    # add kernel declaration and c-api
+    function_declaration_str = str(kernel_declaration_with_default) + ';' + '\n\n' + \
+                               get_c_api_declare(kernel_declaration_with_default.func_name) + '\n'
+
+    # add kernel
+    kernel_impl_str = str(kernel_declaration_without_default) + '\n' + kernel_body_str
+
+    # add kernel c-api
+
+    kernel_c_api_impl_str = get_c_api(kernel_name=kernel_declaration.func_name,
+                                      args=kernel_declaration.args,
+                                      has_return_value=kernel_declaration.return_type.name != 'void')
+
+    # add namespace
+    kernel_code_str = ['namespace {', function_declaration_str, kernel_impl_str,
+                       kernel_c_api_impl_str, '} // namespace']
+    kernel_code_str = '\n\n'.join(kernel_code_str)
+
+    # registration str
+    registration_code_str = get_registration_str(kernel_name=kernel_declaration.func_name)
+
+    # final code
+    final_code = [include_code_str, kernel_code_str, registration_code_str]
+
+    final_code = '\n\n'.join(final_code)
+
+    return final_code
+
+
+"""
+Use a module-level object to hook compile_fx_inner and record the generated code.
+This works in a single process, but needs careful review before use with multi-processing.
+"""
+
+
+class FakeCallableWithCode():
+    code = None
+
+    def __call__(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def set_code(self, code):
+        self.code = code
+
+
+fake_callable = FakeCallableWithCode()
+
+
+@DebugContext.wrap
+@torch.utils._python_dispatch._disable_current_modes()
+def compile_fx_inner_cpu(
+        gm: torch.fx.GraphModule,
+        example_inputs: List[torch.Tensor],
+        cudagraphs=None,
+        num_fixed=0,
+        is_backward=False,
+        graph_id=None,
+):
+    # lift the maximum depth of the Python interpreter stack
+    # to adapt large/deep models
+    compile_fx.sys.setrecursionlimit(max(compile_fx.sys.getrecursionlimit(), 2000))
+    V.debug.fx_graph(gm, example_inputs)
+    shape_env = compile_fx._shape_env_from_inputs(example_inputs)
+    fake_mode = compile_fx.fake_mode_from_tensors(example_inputs)
+    graph = compile_fx.GraphLowering(
+        gm,
+        shape_env=shape_env,
+        num_static_inputs=num_fixed,
+        graph_id=graph_id,
+        fake_mode=fake_mode,
+    )
+    with V.set_graph_handler(graph):
+        graph.run(*example_inputs)
+        code = graph.codegen()
+        fake_callable.set_code(code)
+
+    return fake_callable
+
+
+def extract_inductor_code(kernel, example_inputs):
+    model = fx.symbolic_trace(kernel)
+    compile_fx.compile_fx(model, example_inputs_=example_inputs, inner_compile=compile_fx_inner_cpu)
+
+    code = fake_callable.code
+    return code
diff --git a/examples/inductor/simple_inductor.py b/python/matx/torch_compiler/tests/simple_inductor.py
similarity index 56%
rename from examples/inductor/simple_inductor.py
rename to python/matx/torch_compiler/tests/simple_inductor.py
index a4ef44ff..cee50ce5 100644
--- a/examples/inductor/simple_inductor.py
+++ b/python/matx/torch_compiler/tests/simple_inductor.py
@@ -1,21 +1,16 @@
+import json
+
 import matx
 import torch
-import numpy as np
-import json
 
 
+@matx.inductor(example_inputs=[torch.randn(5), torch.randn(5)])
 def kernel(a: matx.NDArray, b: matx.NDArray):
     c = a + b
     c = torch.nn.functional.relu(c)
     return c,
 
 
-add_kernel = matx.inductor(kernel, example_inputs=[
-    torch.randn(5),
-    torch.randn(5)
-])
-
-
 @matx.script
 def add_json(a: str, b: str) -> str:
     """
@@ -28,7 +23,7 @@ def add_json(a: str, b: str) -> str:
     b_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32')
     c_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32')
 
-    add_kernel(a_tensor, b_tensor, c_tensor)
+    kernel(a_tensor, b_tensor, c_tensor)
 
     result_lst = c_tensor.tolist()
 
@@ -37,29 +32,6 @@ def add_json(a: str, b: str) -> str:
 
 if __name__ == '__main__':
     print(f'Pytorch version {torch.__version__}')
-
-    a_np = np.random.randn(5).astype(np.float32)
-    b_np = np.random.randn(5).astype(np.float32)
-    c_np = np.random.randn(5).astype(np.float32)
-
-    a = matx.NDArray([], a_np.shape, str(a_np.dtype))
-    a.from_numpy(a_np)
-
-    b = matx.NDArray([], b_np.shape, str(b_np.dtype))
-    b.from_numpy(b_np)
-
-    c = matx.NDArray([], c_np.shape, str(c_np.dtype))
-    c.from_numpy(c_np)
-
-    print(a)
-    print(b)
-    print(c)
-
-    print(kernel(a.torch(), b.torch()))
-
-    d = add_kernel(a, b, c)
-    print(c)
-
     a = json.dumps([1, 2, 3, 4, 5])
     b = json.dumps([6, 7, 8, 9, 10])
     result = add_json(a, b)
diff --git a/python/matx/torch_compiler/utils/__init__.py b/python/matx/torch_compiler/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/matx/torch_compiler/utils/cpp_parse.py b/python/matx/torch_compiler/utils/cpp_parse.py
new file mode 100644
index 00000000..e0d9611d
--- /dev/null
+++ b/python/matx/torch_compiler/utils/cpp_parse.py
@@ -0,0 +1,141 @@
+import dataclasses
+from typing import List, Union
+
+
+@dataclasses.dataclass
+class CPPType(object):
+    name: str = None
+    is_pointer: bool = False
+
+    def __str__(self):
+        result = self.name
+        if self.is_pointer:
+            result += '*'
+
+        return result
+
+
+@dataclasses.dataclass
+class CPPArg(object):  # one C++ function argument; str() renders it back as C++ source text
+    name: str = None
+    type: CPPType = dataclasses.field(default_factory=CPPType)  # own CPPType per arg: parse_cpp_arg mutates it in place
+    is_const: bool = False
+    is_restricted: bool = False
+    default_val: Union[str, None] = None
+
+    def __str__(self):  # e.g. "const float* __restrict__ a = nullptr"
+        result = []
+        if self.is_const:
+            result.append('const')
+        result.append(str(self.type))
+        if self.is_restricted:
+            result.append('__restrict__')
+        result.append(self.name)
+
+        if self.default_val is not None:
+            result.append(f'= {self.default_val}')
+
+        return ' '.join(result)
+
+
+def parse_cpp_arg(cpp_arg_str: str) -> CPPArg:
+    """Parse one C++ argument from a string such as ``const float* __restrict__ a = nullptr``.
+
+    :param cpp_arg_str: whitespace-separated argument text; the last word before any ``=`` is the name
+    :return: a CPPArg with name, type, const/restrict flags and optional default value filled in
+    """
+
+    cpp_arg = CPPArg()
+
+    # find if there is a default value (spaces inside it are stripped)
+    if '=' in cpp_arg_str:
+        cpp_arg_str, default_val = cpp_arg_str.split('=')
+        default_val = default_val.replace(' ', '')
+        cpp_arg.default_val = default_val
+
+    word = cpp_arg_str.split()
+
+    cpp_arg.name = word[-1]
+
+    for w in word[:-1]:
+        if w == 'const':
+            cpp_arg.is_const = True
+        elif w == '*':
+            cpp_arg.type.is_pointer = True
+        elif w == '__restrict__':
+            cpp_arg.is_restricted = True
+        else:
+            # any other word is taken as the type name; a trailing '*' marks a pointer
+            if w[-1] == '*':
+                cpp_arg.type.is_pointer = True
+                w = w[:-1]  # remove *
+            cpp_arg.type.name = w
+
+    return cpp_arg
+
+
+@dataclasses.dataclass
+class CPPDeclaration(object):  # a parsed C++ function declaration; str() renders it back without a trailing ';'
+    func_name: str = None
+    return_type: CPPType = dataclasses.field(default_factory=CPPType)  # own CPPType per declaration, not a shared default
+    args: List[CPPArg] = dataclasses.field(default_factory=list)
+    is_extern_c: bool = False
+
+    def append_arg(self, arg: CPPArg):
+        self.args.append(arg)
+
+    def __str__(self):
+        result = []
+        if self.is_extern_c:
+            result.append('extern "C"')
+        result.append(str(self.return_type))
+        result.append(self.func_name)
+
+        front = ' '.join(result)
+        num_spaces = len(front) + 1  # +1 for the '(' so continuation args line up under the first one
+        interval = ',\n' + ' ' * num_spaces
+
+        args_str = interval.join([str(arg) for arg in self.args])
+
+        return front + '(' + args_str + ')'
+
+
+def parse_cpp_declaration(cpp_declaration_str: str) -> CPPDeclaration:
+    """Parse a C++ function declaration string and return a CPPDeclaration.
+
+    :param cpp_declaration_str: text of the form ``[extern "C"] <return type> <name>(<args>)``
+    :return: a CPPDeclaration holding the function name, return type, arguments and extern-C flag
+    """
+    cpp_declaration = CPPDeclaration()
+
+    identifier_return_name, cpp_arg_str = cpp_declaration_str.split('(')  # unpacking requires exactly one '('
+    cpp_arg_str = cpp_arg_str.split(')')[0]
+    cpp_arg_str_lst = cpp_arg_str.split(',')
+    # arguments
+    for cpp_arg_str in cpp_arg_str_lst:
+        cpp_declaration.args.append(parse_cpp_arg(cpp_arg_str))
+
+    # process return type and function name
+    identifier_return_name_lst = identifier_return_name.split()
+    if identifier_return_name_lst[0] == 'extern' and identifier_return_name_lst[1] == '"C"':
+        cpp_declaration.is_extern_c = True
+        identifier_return_name_lst = identifier_return_name_lst[2:]
+
+    cpp_declaration.func_name = identifier_return_name_lst[-1]
+    # remove func_name
+    return_type_str_lst = identifier_return_name_lst[:-1]
+
+    if len(return_type_str_lst) == 1:
+        return_type_str = return_type_str_lst[0]
+        if return_type_str[-1] == '*':
+            cpp_declaration.return_type.name = return_type_str[:-1]
+            cpp_declaration.return_type.is_pointer = True
+        else:
+            cpp_declaration.return_type.name = return_type_str
+    else:
+        assert len(return_type_str_lst) == 2  # only "<type> *" is accepted as a two-word return type
+        assert return_type_str_lst[-1] == '*'
+        cpp_declaration.return_type.name = return_type_str_lst[0]
+        cpp_declaration.return_type.is_pointer = True
+
+    return cpp_declaration

From 3b7d7373ce26ddc9da884ff59b4768ef84e020f7 Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Tue, 31 Jan 2023 15:41:22 +0800
Subject: [PATCH 06/21] [matx.inductor] fix kernel name

---
 python/matx/inductor/__init__.py               |  4 ++--
 python/matx/torch_compiler/codegen.py          | 18 +++++++++++++-----
 .../torch_compiler/tests/simple_inductor.py    |  4 ++--
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py
index b3a61d84..88e87920 100644
--- a/python/matx/inductor/__init__.py
+++ b/python/matx/inductor/__init__.py
@@ -12,8 +12,8 @@
 def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> context.ScriptContext:
     try:
 
-        code = extract_inductor_code(compiling_obj, example_inputs)
-        code = matx_cpp_code_format(code)
+        code, kernel_name = extract_inductor_code(compiling_obj, example_inputs)
+        code = matx_cpp_code_format(code, kernel_name)
 
         sc_ctx = context.ScriptContext()
         sc_ctx.build_type = context.BuildType.FUNCTION
diff --git a/python/matx/torch_compiler/codegen.py b/python/matx/torch_compiler/codegen.py
index 4d2566c9..2d84345c 100644
--- a/python/matx/torch_compiler/codegen.py
+++ b/python/matx/torch_compiler/codegen.py
@@ -35,7 +35,7 @@ def generate_ndarray_arg_cast(arg_name, arg_index, dtype, message='TODO'):
 
 def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) -> str:
     template_with_return = '''
-int kernel__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr)
+int {}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr)
 {{
   TArgs args_t(args, num_args);
 
@@ -63,7 +63,7 @@ def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value)
 }}
 '''
     template_without_return = '''
-int kernel__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr)
+int {}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr)
 {{
   TArgs args_t(args, num_args);
 
@@ -115,7 +115,7 @@ def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value)
     pos_arg_cast = (',' + pos_arg_cast_indentation).join(pos_arg_cast_lst)
     args_t_cast = (',' + args_t_cast_indentation).join(args_t_cast_lst)
 
-    return template.format(num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name,
+    return template.format(kernel_name, num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name,
                            pos_arg_cast, kernel_name_indentation, num_args, kernel_name,
                            args_t_cast, kernel_name_indentation)
 
@@ -152,7 +152,7 @@ def extract_cpp_code(code: str):
     return code.split("'''")[1][1:-1]
 
 
-def matx_cpp_code_format(code: str) -> str:
+def matx_cpp_code_format(code: str, kernel_name: str) -> str:
     code = extract_cpp_code(code)
     # split include and kernel code
     first_newline_idx = code.find('\n')
@@ -168,6 +168,8 @@ def matx_cpp_code_format(code: str) -> str:
     kernel_body_str = kernel_code_str[first_open_bracket:]
 
     kernel_declaration = cpp_parse.parse_cpp_declaration(kernel_declaration_str)
+    # TODO: remove this hack after port to C++ codegen
+    kernel_declaration.func_name = kernel_name
 
     kernel_declaration_without_default = copy.deepcopy(kernel_declaration)
     kernel_declaration_without_default.append_arg(SESSION_HANLDER)
@@ -258,4 +260,10 @@ def extract_inductor_code(kernel, example_inputs):
     compile_fx.compile_fx(model, example_inputs_=example_inputs, inner_compile=compile_fx_inner_cpu)
 
     code = fake_callable.code
-    return code
+
+    # By default, Pytorch compiles a Python module with all the C++ kernel with unified name kernel.
+    # The actual kernel name should be kernel.__name__.
+    # TODO: fix this after rewriting inductor codegen to all C++ instead of a Python module
+    kernel_name = kernel.__name__
+
+    return code, kernel_name
diff --git a/python/matx/torch_compiler/tests/simple_inductor.py b/python/matx/torch_compiler/tests/simple_inductor.py
index cee50ce5..a48ed2cf 100644
--- a/python/matx/torch_compiler/tests/simple_inductor.py
+++ b/python/matx/torch_compiler/tests/simple_inductor.py
@@ -5,7 +5,7 @@
 
 
 @matx.inductor(example_inputs=[torch.randn(5), torch.randn(5)])
-def kernel(a: matx.NDArray, b: matx.NDArray):
+def add_relu(a: matx.NDArray, b: matx.NDArray):
     c = a + b
     c = torch.nn.functional.relu(c)
     return c,
@@ -23,7 +23,7 @@ def add_json(a: str, b: str) -> str:
     b_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32')
     c_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32')
 
-    kernel(a_tensor, b_tensor, c_tensor)
+    add_relu(a_tensor, b_tensor, c_tensor)
 
     result_lst = c_tensor.tolist()
 

From 79de762ddf9085bfe1780d72e6e5e76e4f61c5de Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Wed, 1 Feb 2023 15:29:11 +0800
Subject: [PATCH 07/21] update matx formatter to match the raw function
 signature

---
 cpp_playground/main.cpp                       |   3 +
 python/matx/torch_compiler/codegen.py         | 269 ---------------
 .../{utils => codegen}/__init__.py            |   0
 .../codegen/inductor/__init__.py              |   0
 .../torch_compiler/codegen/matx_formatter.py  | 321 ++++++++++++++++++
 .../torch_compiler/codegen/utils/__init__.py  |   0
 .../{ => codegen}/utils/cpp_parse.py          |   0
 .../torch_compiler/tests/nested_inputs.py     |   0
 test/inductor/test_basic.py                   |   0
 9 files changed, 324 insertions(+), 269 deletions(-)
 create mode 100644 cpp_playground/main.cpp
 delete mode 100644 python/matx/torch_compiler/codegen.py
 rename python/matx/torch_compiler/{utils => codegen}/__init__.py (100%)
 create mode 100644 python/matx/torch_compiler/codegen/inductor/__init__.py
 create mode 100644 python/matx/torch_compiler/codegen/matx_formatter.py
 create mode 100644 python/matx/torch_compiler/codegen/utils/__init__.py
 rename python/matx/torch_compiler/{ => codegen}/utils/cpp_parse.py (100%)
 create mode 100644 python/matx/torch_compiler/tests/nested_inputs.py
 create mode 100644 test/inductor/test_basic.py

diff --git a/cpp_playground/main.cpp b/cpp_playground/main.cpp
new file mode 100644
index 00000000..c302b43a
--- /dev/null
+++ b/cpp_playground/main.cpp
@@ -0,0 +1,3 @@
+//
+// Created by ByteDance on 2023/2/1.
+//
diff --git a/python/matx/torch_compiler/codegen.py b/python/matx/torch_compiler/codegen.py
deleted file mode 100644
index 2d84345c..00000000
--- a/python/matx/torch_compiler/codegen.py
+++ /dev/null
@@ -1,269 +0,0 @@
-import copy
-import logging
-from typing import List
-
-import torch
-import torch._inductor.compile_fx as compile_fx
-from torch import fx
-from torch._inductor.debug import DebugContext
-from torch._inductor.virtualized import V
-
-from .utils import cpp_parse
-
-log = logging.getLogger(__name__)
-
-MATX_INCLUDE = '''
-#include "matxscript/runtime/codegen_all_includes.h"
-#include <math.h>
-
-using namespace ::matxscript::runtime;
-extern "C" void* __matxscript_module_ctx = NULL;
-
-extern "C" MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__;
-'''
-
-SESSION_HANLDER = cpp_parse.CPPArg(name='handle_2_71828182846',
-                                   type=cpp_parse.CPPType(name='void', is_pointer=True))
-SESSION_HANLDER_WITH_DEAFULT = cpp_parse.CPPArg(name='handle_2_71828182846',
-                                                type=cpp_parse.CPPType(name='void', is_pointer=True),
-                                                default_val='((void*)(int64_t)0)')
-
-
-def generate_ndarray_arg_cast(arg_name, arg_index, dtype, message='TODO'):
-    return f'({dtype}*)internal::TypeAsHelper<NDArray>::run(({arg_name}[{arg_index}]), __FILE__, __LINE__, "{message}", "{message}").Data<{dtype}>()'
-
-
-def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) -> str:
-    template_with_return = '''
-int {}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr)
-{{
-  TArgs args_t(args, num_args);
-
-  if (num_args > 0 && args[num_args - 1].code == TypeIndex::kRuntimeKwargs) {{
-    string_view arg_names[{}] {{{}}};
-    KwargsUnpackHelper helper("{}", arg_names, {}, nullptr, 0);
-    RTView pos_args[{}];
-    helper.unpack(pos_args, args, num_args);  // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5
-
-    auto ret = {}({}, 
-                {}resource_handle);
-    RTValue(std::move(ret)).MoveToCHost(out_ret_value);
-  }} else {{
-    switch(num_args) {{
-      case {}: {{
-        auto ret = {}({}, 
-                    {}resource_handle);  // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5
-        RTValue(std::move(ret)).MoveToCHost(out_ret_value);
-      }} break;
-      default: {{THROW_PY_TypeError("TODO");}} break;  // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5
-    }}
-  }}
-
-  return 0;
-}}
-'''
-    template_without_return = '''
-int {}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr)
-{{
-  TArgs args_t(args, num_args);
-
-  if (num_args > 0 && args[num_args - 1].code == TypeIndex::kRuntimeKwargs) {{
-    string_view arg_names[{}] {{{}}};
-    KwargsUnpackHelper helper("{}", arg_names, {}, nullptr, 0);
-    RTView pos_args[{}];
-    helper.unpack(pos_args, args, num_args);  // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5
-
-    {}({}, 
-     {}resource_handle);
-  }} else {{
-    switch(num_args) {{
-      case {}: {{
-        {}({}, 
-         {}resource_handle);  // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5
-        int ret = 1;
-        RTValue(std::move(ret)).MoveToCHost(out_ret_value);
-      }} break;
-      default: {{THROW_PY_TypeError("TODO");}} break;  // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5
-    }}
-  }}
-
-  return 0;
-}}
-'''
-    if has_return_value:
-        template = template_with_return
-    else:
-        template = template_without_return
-
-    num_args = len(args)
-    arg_names_concat_str = ', '.join([f'"{arg.name}"' for arg in args])
-    args_dtype = [arg.type.name for arg in args]
-
-    pos_arg_cast_lst = []
-    args_t_cast_lst = []
-    for arg_index in range(num_args):
-        pos_arg_cast_lst.append(generate_ndarray_arg_cast('pos_args', arg_index, args_dtype[arg_index]))
-        args_t_cast_lst.append(generate_ndarray_arg_cast('args_t', arg_index, args_dtype[arg_index]))
-
-    kernel_name_indentation = len(kernel_name) * ' '
-    if has_return_value:
-        return_name_indentation = ' ' * 11
-    else:
-        return_name_indentation = ''
-    pos_arg_cast_indentation = '\n     ' + kernel_name_indentation + return_name_indentation
-    args_t_cast_indentation = '\n         ' + kernel_name_indentation + return_name_indentation
-    pos_arg_cast = (',' + pos_arg_cast_indentation).join(pos_arg_cast_lst)
-    args_t_cast = (',' + args_t_cast_indentation).join(args_t_cast_lst)
-
-    return template.format(kernel_name, num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name,
-                           pos_arg_cast, kernel_name_indentation, num_args, kernel_name,
-                           args_t_cast, kernel_name_indentation)
-
-
-def get_registration_str(kernel_name):
-    # TODO: currently, only 1 function is here.
-    template = '''
-extern "C" {{
-
-MATX_DLL MATXScriptBackendPackedCFunc __matxscript_func_array__[] = {{
-    (MATXScriptBackendPackedCFunc){}__c_api,
-}};
-MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__ = {{
-    "1\\000{}\\000",    __matxscript_func_array__,
-}};
-
-}} // extern C
-
-extern "C" {{
-
-MATX_DLL const char* __matxscript_closures_names__ = "1\\000{}\\000";
-
-}} // extern C
-
-    '''
-    return template.format(kernel_name, kernel_name, kernel_name)
-
-
-def get_c_api_declare(kernel_name):
-    return f'int {kernel_name}__c_api(MATXScriptAny*, int, MATXScriptAny*, void*);'
-
-
-def extract_cpp_code(code: str):
-    return code.split("'''")[1][1:-1]
-
-
-def matx_cpp_code_format(code: str, kernel_name: str) -> str:
-    code = extract_cpp_code(code)
-    # split include and kernel code
-    first_newline_idx = code.find('\n')
-    include_code_str = code[:first_newline_idx]
-    kernel_code_str = code[first_newline_idx + 1:]
-
-    # add matx include
-    include_code_str += MATX_INCLUDE
-
-    # extract kernel declaration
-    first_open_bracket = kernel_code_str.find('{')
-    kernel_declaration_str = kernel_code_str[:first_open_bracket]
-    kernel_body_str = kernel_code_str[first_open_bracket:]
-
-    kernel_declaration = cpp_parse.parse_cpp_declaration(kernel_declaration_str)
-    # TODO: remove this hack after port to C++ codegen
-    kernel_declaration.func_name = kernel_name
-
-    kernel_declaration_without_default = copy.deepcopy(kernel_declaration)
-    kernel_declaration_without_default.append_arg(SESSION_HANLDER)
-    kernel_declaration_with_default = copy.deepcopy(kernel_declaration)
-    kernel_declaration_with_default.append_arg(SESSION_HANLDER_WITH_DEAFULT)
-
-    # add kernel declaration and c-api
-    function_declaration_str = str(kernel_declaration_with_default) + ';' + '\n\n' + \
-                               get_c_api_declare(kernel_declaration_with_default.func_name) + '\n'
-
-    # add kernel
-    kernel_impl_str = str(kernel_declaration_without_default) + '\n' + kernel_body_str
-
-    # add kernel c-api
-
-    kernel_c_api_impl_str = get_c_api(kernel_name=kernel_declaration.func_name,
-                                      args=kernel_declaration.args,
-                                      has_return_value=kernel_declaration.return_type.name != 'void')
-
-    # add namespace
-    kernel_code_str = ['namespace {', function_declaration_str, kernel_impl_str,
-                       kernel_c_api_impl_str, '} // namespace']
-    kernel_code_str = '\n\n'.join(kernel_code_str)
-
-    # registration str
-    registration_code_str = get_registration_str(kernel_name=kernel_declaration.func_name)
-
-    # final code
-    final_code = [include_code_str, kernel_code_str, registration_code_str]
-
-    final_code = '\n\n'.join(final_code)
-
-    return final_code
-
-
-"""
-Use a global variable to hack the compile_fx_inner and record the compiled code.
-This works in single process problem, but requires careful review in multi-processing
-"""
-
-
-class FakeCallableWithCode():
-    code = None
-
-    def __call__(self, *args, **kwargs):
-        raise NotImplementedError
-
-    def set_code(self, code):
-        self.code = code
-
-
-fake_callable = FakeCallableWithCode()
-
-
-@DebugContext.wrap
-@torch.utils._python_dispatch._disable_current_modes()
-def compile_fx_inner_cpu(
-        gm: torch.fx.GraphModule,
-        example_inputs: List[torch.Tensor],
-        cudagraphs=None,
-        num_fixed=0,
-        is_backward=False,
-        graph_id=None,
-):
-    # lift the maximum depth of the Python interpreter stack
-    # to adapt large/deep models
-    compile_fx.sys.setrecursionlimit(max(compile_fx.sys.getrecursionlimit(), 2000))
-    V.debug.fx_graph(gm, example_inputs)
-    shape_env = compile_fx._shape_env_from_inputs(example_inputs)
-    fake_mode = compile_fx.fake_mode_from_tensors(example_inputs)
-    graph = compile_fx.GraphLowering(
-        gm,
-        shape_env=shape_env,
-        num_static_inputs=num_fixed,
-        graph_id=graph_id,
-        fake_mode=fake_mode,
-    )
-    with V.set_graph_handler(graph):
-        graph.run(*example_inputs)
-        code = graph.codegen()
-        fake_callable.set_code(code)
-
-    return fake_callable
-
-
-def extract_inductor_code(kernel, example_inputs):
-    model = fx.symbolic_trace(kernel)
-    compile_fx.compile_fx(model, example_inputs_=example_inputs, inner_compile=compile_fx_inner_cpu)
-
-    code = fake_callable.code
-
-    # By default, Pytorch compiles a Python module with all the C++ kernel with unified name kernel.
-    # The actual kernel name should be kernel.__name__.
-    # TODO: fix this after rewriting inductor codegen to all C++ instead of a Python module
-    kernel_name = kernel.__name__
-
-    return code, kernel_name
diff --git a/python/matx/torch_compiler/utils/__init__.py b/python/matx/torch_compiler/codegen/__init__.py
similarity index 100%
rename from python/matx/torch_compiler/utils/__init__.py
rename to python/matx/torch_compiler/codegen/__init__.py
diff --git a/python/matx/torch_compiler/codegen/inductor/__init__.py b/python/matx/torch_compiler/codegen/inductor/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/matx/torch_compiler/codegen/matx_formatter.py b/python/matx/torch_compiler/codegen/matx_formatter.py
new file mode 100644
index 00000000..36cce17f
--- /dev/null
+++ b/python/matx/torch_compiler/codegen/matx_formatter.py
@@ -0,0 +1,321 @@
+# Copyright 2022 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Utilities to format kernel code generated by inductor to a JITOp
+"""
+
+import copy
+import logging
+from typing import List
+
+import torch
+
+from .utils import cpp_parse
+
+log = logging.getLogger(__name__)
+
+MAGIC_NUMBER = '2_71828182846'
+
+MATX_INCLUDE = '''
+#include "matxscript/runtime/codegen_all_includes.h"
+#include <math.h>
+
+using namespace ::matxscript::runtime;
+extern "C" void* __matxscript_module_ctx = NULL;
+
+extern "C" MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__;
+
+
+
+'''
+
+SESSION_HANLDER = cpp_parse.CPPArg(name=f'handle_{MAGIC_NUMBER}',
+                                   type=cpp_parse.CPPType(name='void', is_pointer=True))
+SESSION_HANLDER_WITH_DEAFULT = cpp_parse.CPPArg(name=f'handle_{MAGIC_NUMBER}',
+                                                type=cpp_parse.CPPType(name='void', is_pointer=True),
+                                                default_val='((void*)(int64_t)0)')
+
+CREATE_NDARRAY_DECLARATION = '''
+// helper function to create NDArray
+NDArray createNDArray(const std::string& dtype,
+                      const std::string& device,
+                      const std::vector<int64_t>& arg_shape);
+'''
+
+CREATE_NDARRAY_IMPLEMENTATION = '''
+NDArray createNDArray(const std::string& dtype,
+                      const std::string& device,
+                      const std::vector<int64_t>& arg_shape) {
+  Unicode dtype_str(UTF8Decode(dtype));
+  Unicode ctx_str(UTF8Decode(device));
+  DataType data_type(String2DLDataType(UTF8Encode(dtype_str.view())));
+  return NDArray::Empty(arg_shape, data_type, NDArrayHelper::GetDevice(ctx_str));
+}
+'''
+
+
+def generate_ndarray_arg_cast(arg_name, arg_index, message='TODO'):
+    return f'internal::TypeAsHelper<NDArray>::run(({arg_name}[{arg_index}]), __FILE__, __LINE__, "{message}", "{message}")'
+
+
+def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value) -> str:
+    template_with_return = '''
+int {}__c_api(MATXScriptAny* args, int num_args, MATXScriptAny* out_ret_value, void* resource_handle = nullptr)
+{{
+  TArgs args_t(args, num_args);
+
+  if (num_args > 0 && args[num_args - 1].code == TypeIndex::kRuntimeKwargs) {{
+    string_view arg_names[{}] {{{}}};
+    KwargsUnpackHelper helper("{}", arg_names, {}, nullptr, 0);
+    RTView pos_args[{}];
+    helper.unpack(pos_args, args, num_args);  // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5
+
+    auto ret = {}({}, 
+                {}resource_handle);
+    RTValue(std::move(ret)).MoveToCHost(out_ret_value);
+  }} else {{
+    switch(num_args) {{
+      case {}: {{
+        auto ret = {}({}, 
+                    {}resource_handle);  // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5
+        RTValue(std::move(ret)).MoveToCHost(out_ret_value);
+      }} break;
+      default: {{THROW_PY_TypeError("TODO");}} break;  // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5
+    }}
+  }}
+
+  return 0;
+}}
+'''
+    assert has_return_value
+    template = template_with_return
+
+    num_args = len(args)
+    arg_names_concat_str = ', '.join([f'"{arg.name}"' for arg in args])
+    args_dtype = [arg.type.name for arg in args]
+
+    pos_arg_cast_lst = []
+    args_t_cast_lst = []
+    for arg_index in range(num_args):
+        pos_arg_cast_lst.append(generate_ndarray_arg_cast('pos_args', arg_index))
+        args_t_cast_lst.append(generate_ndarray_arg_cast('args_t', arg_index))
+
+    kernel_name_indentation = len(kernel_name) * ' '
+    if has_return_value:
+        return_name_indentation = ' ' * 11
+    else:
+        return_name_indentation = ''
+    pos_arg_cast_indentation = '\n     ' + kernel_name_indentation + return_name_indentation
+    args_t_cast_indentation = '\n         ' + kernel_name_indentation + return_name_indentation
+    pos_arg_cast = (',' + pos_arg_cast_indentation).join(pos_arg_cast_lst)
+    args_t_cast = (',' + args_t_cast_indentation).join(args_t_cast_lst)
+
+    return template.format(kernel_name, num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name,
+                           pos_arg_cast, kernel_name_indentation, num_args, kernel_name,
+                           args_t_cast, kernel_name_indentation)
+
+
+def get_registration_str(kernel_name):
+    # TODO: currently, only 1 function is here.
+    template = '''
+extern "C" {{
+
+MATX_DLL MATXScriptBackendPackedCFunc __matxscript_func_array__[] = {{
+    (MATXScriptBackendPackedCFunc){}__c_api,
+}};
+MATX_DLL MATXScriptFuncRegistry __matxscript_func_registry__ = {{
+    "1\\000{}\\000",    __matxscript_func_array__,
+}};
+
+}} // extern C
+
+extern "C" {{
+
+MATX_DLL const char* __matxscript_closures_names__ = "1\\000{}\\000";
+
+}} // extern C
+
+    '''
+    return template.format(kernel_name, kernel_name, kernel_name)
+
+
+def get_c_api_declare(kernel_name):
+    return f'int {kernel_name}__c_api(MATXScriptAny*, int, MATXScriptAny*, void*);'
+
+
+def extract_cpp_code(code: str):
+    return code.split("'''")[1][1:-1]
+
+
+def split_include_kernel(code):
+    first_newline_idx = code.find('\n')
+    include_code_str = code[:first_newline_idx]
+    kernel_code_str = code[first_newline_idx + 1:]
+    return include_code_str, kernel_code_str
+
+
+def split_declaration_body(kernel_code_str):
+    first_open_bracket = kernel_code_str.find('{')
+    kernel_declaration_str = kernel_code_str[:first_open_bracket]
+    kernel_body_str = kernel_code_str[first_open_bracket:]
+    return kernel_declaration_str, kernel_body_str
+
+
+def generate_kernel_wrapper_declaration(kernel_name, example_inputs):
+    return_type = cpp_parse.CPPType(name='Tuple', is_pointer=False)
+    args = []
+    for i in range(len(example_inputs)):
+        arg = cpp_parse.CPPArg(name=f'in_ptr{i}', type=cpp_parse.CPPType(name='NDArray', is_pointer=False),
+                               is_const=False, is_restricted=False)
+        args.append(arg)
+    kernel_wrapper_declaration = cpp_parse.CPPDeclaration(func_name=kernel_name,
+                                                          return_type=return_type,
+                                                          args=args,
+                                                          is_extern_c=False)
+    return kernel_wrapper_declaration
+
+
+def generate_ndarray_allocate_statement(output_name: str, dtype: str, device: str, shape: List[int]):
+    assert dtype in ['int32', 'int64', 'float32', 'float64']
+    assert device == 'cpu'
+    assert isinstance(shape, List)
+    for shape_int in shape:
+        assert isinstance(shape_int, int)
+
+    shape = [str(shape_int) for shape_int in shape]
+    shape_str = ', '.join(shape)
+
+    return f'NDArray {output_name} = createNDArray("{dtype}", "{device}", {{{shape_str}}});'
+
+
+def generate_ndarray_cast(var_name, dtype):
+    return f'({dtype}*){var_name}.Data<{dtype}>()'
+
+
+def generate_kernel_wrapper_return(fake_output):
+    output_str = [f'out_ptr{i}' for i in range(len(fake_output))]
+    output_str = ','.join(output_str)
+    return f'return Kernel_Tuple::make(std::initializer_list<Tuple::value_type>{{{output_str}}});'
+
+
+TORCH_DTYPE_TO_NDARRAY_DTYPE = {
+    torch.float32: 'float32',
+    torch.float64: 'float64',
+    torch.int32: 'int32',
+    torch.int64: 'int64'
+}
+
+
+def generate_kernel_wrapper_body(kernel_declaration: cpp_parse.CPPDeclaration,
+                                 fake_output: List[torch.Tensor]):
+    # Returns the brace-enclosed C++ wrapper body: allocate outputs, call the kernel, return a Tuple.
+    # step 0: obtain output args from kernel_declaration
+    # step 1: allocate output NDArray
+    ndarray_allocate_statements = []
+    for i, output in enumerate(fake_output):
+        assert output.dtype in TORCH_DTYPE_TO_NDARRAY_DTYPE
+        dtype = TORCH_DTYPE_TO_NDARRAY_DTYPE[output.dtype]
+
+        ndarray_allocate_statement = generate_ndarray_allocate_statement(output_name=f'out_ptr{i}',
+                                                                         dtype=dtype,
+                                                                         device=str(output.device),
+                                                                         shape=list(output.shape))
+        ndarray_allocate_statements.append(ndarray_allocate_statement)
+
+    ndarray_allocate_statements = '\n'.join(ndarray_allocate_statements) + '\n\n'
+
+    # step 2: invoke kernel
+    kernel_invoke_param = []
+    for arg in kernel_declaration.args:
+        kernel_invoke_param.append(generate_ndarray_cast(var_name=arg.name, dtype=arg.type.name))
+
+    num_space = 10
+    delimiter = ',\n' + ' ' * 10
+    kernel_invoke_param_str = delimiter.join(kernel_invoke_param)
+    kernel_invoke_str = kernel_declaration.func_name + '(' + '\n' + ' ' * num_space + \
+                        kernel_invoke_param_str + '\n' + ');' + '\n'  # ';' terminates the C++ call statement
+
+    # step 3: return output as a Tuple
+    return_str = generate_kernel_wrapper_return(fake_output)
+
+    # step 4: add bracket
+    final_result = '\n{\n' + ndarray_allocate_statements + kernel_invoke_str + return_str + '\n}'
+
+    return final_result
+
+
+def matx_cpp_code_format(code: str, kernel_name: str,
+                         example_inputs: List[torch.Tensor],
+                         fake_output: List[torch.Tensor]) -> str:
+    code = extract_cpp_code(code)
+    # split include and kernel code
+
+    include_code_str, kernel_code_str = split_include_kernel(code)
+    # add matx include
+    include_code_str += MATX_INCLUDE
+
+    # extract kernel declaration
+    kernel_declaration_str, kernel_body_str = split_declaration_body(kernel_code_str)
+
+    kernel_declaration = cpp_parse.parse_cpp_declaration(kernel_declaration_str)
+    kernel_return_type = kernel_declaration.return_type.name
+    assert kernel_return_type == 'void', f'The kernel return type must be void, Got {kernel_return_type}'
+
+    kernel_declaration.func_name += MAGIC_NUMBER
+    kernel_code_str = str(kernel_declaration) + kernel_body_str
+
+    # here, we keep the original kernel and add a wrapper
+    kernel_wrapper_declaration = generate_kernel_wrapper_declaration(kernel_name, example_inputs)
+    kernel_wrapper_body = generate_kernel_wrapper_body(kernel_declaration, fake_output)
+
+    kernel_wrapper_declaration_without_default = copy.deepcopy(kernel_wrapper_declaration)
+    kernel_wrapper_declaration_without_default.append_arg(SESSION_HANLDER)
+    kernel_wrapper_declaration_with_default = copy.deepcopy(kernel_wrapper_declaration)
+    kernel_wrapper_declaration_with_default.append_arg(SESSION_HANLDER_WITH_DEAFULT)
+
+    # create all the declarations strings
+    function_declaration = [CREATE_NDARRAY_DECLARATION, str(kernel_wrapper_declaration_with_default),
+                            str(kernel_declaration) + ';', get_c_api_declare(kernel_wrapper_declaration.func_name)]
+
+    function_declaration_str = '\n\n'.join(function_declaration) + '\n'
+
+    # create all the kernel implementation strings including
+    # 1. create ndarray. 2. kernel wrapper, 3. kernel, 4. kernel-c-api
+    kernel_wrapper = str(kernel_wrapper_declaration) + kernel_wrapper_body
+    kernel_c_api_impl_str = get_c_api(kernel_name=kernel_wrapper_declaration.func_name,
+                                      args=kernel_wrapper_declaration.args,
+                                      has_return_value=kernel_wrapper_declaration.return_type.name != 'void')
+
+    implementations = [CREATE_NDARRAY_IMPLEMENTATION, kernel_wrapper, kernel_code_str, kernel_c_api_impl_str]
+    implementations_str = '\n\n'.join(implementations) + '\n'
+
+    # add namespace
+    kernel_code_str = ['namespace {', function_declaration_str, implementations_str, '} // namespace']
+    kernel_code_str = '\n\n'.join(kernel_code_str)
+
+    # registration str
+    registration_code_str = get_registration_str(kernel_name=kernel_declaration.func_name)
+
+    # final code
+    final_code = [include_code_str, kernel_code_str, registration_code_str]
+
+    final_code = '\n\n'.join(final_code)
+
+    return final_code
diff --git a/python/matx/torch_compiler/codegen/utils/__init__.py b/python/matx/torch_compiler/codegen/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/matx/torch_compiler/utils/cpp_parse.py b/python/matx/torch_compiler/codegen/utils/cpp_parse.py
similarity index 100%
rename from python/matx/torch_compiler/utils/cpp_parse.py
rename to python/matx/torch_compiler/codegen/utils/cpp_parse.py
diff --git a/python/matx/torch_compiler/tests/nested_inputs.py b/python/matx/torch_compiler/tests/nested_inputs.py
new file mode 100644
index 00000000..e69de29b
diff --git a/test/inductor/test_basic.py b/test/inductor/test_basic.py
new file mode 100644
index 00000000..e69de29b

From b3c0289fb2eaf6301936dac0c80f652ea570cf07 Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Wed, 1 Feb 2023 15:58:56 +0800
Subject: [PATCH 08/21] add kernel wrapper to match the raw python function
 signature

---
 cpp_playground/main.cpp                       |  3 -
 python/matx/inductor/__init__.py              | 27 +++---
 python/matx/toolchain.py                      |  9 +-
 .../matx/torch_compiler/codegen/__init__.py   |  2 +
 .../codegen/inductor/__init__.py              | 90 +++++++++++++++++++
 .../torch_compiler/codegen/matx_formatter.py  | 13 +--
 .../torch_compiler/tests/nested_inputs.py     |  0
 .../torch_compiler/tests/simple_inductor.py   | 17 ++--
 .../matx/torch_compiler/tests/tuple_output.py |  8 ++
 9 files changed, 136 insertions(+), 33 deletions(-)
 delete mode 100644 cpp_playground/main.cpp
 delete mode 100644 python/matx/torch_compiler/tests/nested_inputs.py
 create mode 100644 python/matx/torch_compiler/tests/tuple_output.py

diff --git a/cpp_playground/main.cpp b/cpp_playground/main.cpp
deleted file mode 100644
index c302b43a..00000000
--- a/cpp_playground/main.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-//
-// Created by ByteDance on 2023/2/1.
-//
diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py
index 88e87920..e19f2c57 100644
--- a/python/matx/inductor/__init__.py
+++ b/python/matx/inductor/__init__.py
@@ -2,23 +2,19 @@
 from typing import List
 
 import torch
-from matx.torch_compiler.codegen import extract_inductor_code, matx_cpp_code_format
 
 from matx.env import MATX_DEV_MODE
 from matx.script import context
 from matx.toolchain import path_prefix
+from matx.torch_compiler.codegen import extract_inductor_code, matx_cpp_code_format
 
 
 def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> context.ScriptContext:
     try:
-
-        code, kernel_name = extract_inductor_code(compiling_obj, example_inputs)
-        code = matx_cpp_code_format(code, kernel_name)
-
+        # set sc_ctx attributes to be compatible with existing matx code
         sc_ctx = context.ScriptContext()
         sc_ctx.build_type = context.BuildType.FUNCTION
         sc_ctx.main_node.raw = compiling_obj
-        # set sc_ctx attributes to be compatible with existing matx code
         inductor_context = context.InductorContext(fn_name=compiling_obj.__name__)
         sc_ctx.main_node.context = inductor_context
         # set source code TODO: formatting source code
@@ -27,6 +23,18 @@ def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> cont
         frame = inspect.stack()[3]
         sc_ctx.main_node.span.file_name = frame[0].f_code.co_filename
 
+        # set args types.
+        from .. import ir
+
+        # TODO: currently, only NDArray arguments are supported. We may support nested inputs later.
+        signature = inspect.signature(compiling_obj)
+        for param in signature.parameters.values():
+            sc_ctx.main_node.context.arg_types[param.name] = ir.type.NDArrayType()
+
+        # compile the kernel and set the code
+        code, kernel_name, fake_output = extract_inductor_code(compiling_obj, example_inputs)
+        code = matx_cpp_code_format(code, kernel_name, example_inputs, fake_output)
+
         # export code
         path = path_prefix(sc_ctx)
         with open(path, 'w') as f:
@@ -37,13 +45,6 @@ def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> cont
         build_module = _ffi.get_global_func("embedded.build.c")
         sc_ctx.rt_module = build_module(code.encode())
 
-        # set args types. # TODO: hardcode for now
-        from .. import ir
-        sc_ctx.main_node.context.arg_types = dict(
-            a=ir.type.NDArrayType(),
-            b=ir.type.NDArrayType()
-        )
-
         return sc_ctx
     except BaseException as e:
         if MATX_DEV_MODE:
diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py
index bd946627..557bc278 100644
--- a/python/matx/toolchain.py
+++ b/python/matx/toolchain.py
@@ -400,23 +400,20 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl
         '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/TH',
         '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/THC',
         '-I/Users/bytedance/miniforge3/envs/inductor/include/python3.10',
-        '-lgomp',
-        '-march=native',
+        # '-lgomp',
+        # '-march=native',
         '-ffast-math',
         '-fno-finite-math-only',
-        '-fopenmp',
+        # '-fopenmp',
         '-DC10_USING_CUSTOM_GENERATED_MACROS'
     ]
 
-
     build_dso(result, toolchain is not None, compile_options=torch_compiler_options)
     if toolchain is not None:
         toolchain_build(result, toolchain)
 
     if result.build_type is context.BuildType.FUNCTION:
         return make_jit_op_creator(result, share, bundle_args=bundle_args)()
-    elif result.build_type is context.BuildType.JIT_OBJECT:
-        return make_jit_object_creator(result, share, bundle_args=bundle_args)
     else:
         raise ValueError('Unsupported build_type: {}'.format(result.build_type))
 
diff --git a/python/matx/torch_compiler/codegen/__init__.py b/python/matx/torch_compiler/codegen/__init__.py
index e69de29b..6935d78a 100644
--- a/python/matx/torch_compiler/codegen/__init__.py
+++ b/python/matx/torch_compiler/codegen/__init__.py
@@ -0,0 +1,2 @@
+from .inductor import extract_inductor_code
+from .matx_formatter import matx_cpp_code_format
\ No newline at end of file
diff --git a/python/matx/torch_compiler/codegen/inductor/__init__.py b/python/matx/torch_compiler/codegen/inductor/__init__.py
index e69de29b..837d4448 100644
--- a/python/matx/torch_compiler/codegen/inductor/__init__.py
+++ b/python/matx/torch_compiler/codegen/inductor/__init__.py
@@ -0,0 +1,90 @@
+from typing import List, Tuple
+
+import torch
+import torch._inductor.compile_fx as compile_fx
+from torch import fx
+from torch._inductor.debug import DebugContext
+from torch._inductor.virtualized import V
+
+"""
+Use a global variable to hack the compile_fx_inner and record the compiled code.
+This works within a single process, but requires careful review under multi-processing.
+"""
+
+
+class FakeCallableWithCode():
+    code = None
+
+    def __call__(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def set_code(self, code):
+        self.code = code
+
+
+fake_callable = FakeCallableWithCode()
+
+
+@DebugContext.wrap
+@torch.utils._python_dispatch._disable_current_modes()
+def compile_fx_inner_cpu(
+        gm: torch.fx.GraphModule,
+        example_inputs: List[torch.Tensor],
+        cudagraphs=None,
+        num_fixed=0,
+        is_backward=False,
+        graph_id=None,
+):
+    # lift the maximum depth of the Python interpreter stack
+    # to adapt large/deep models
+    compile_fx.sys.setrecursionlimit(max(compile_fx.sys.getrecursionlimit(), 2000))
+    V.debug.fx_graph(gm, example_inputs)
+    shape_env = compile_fx._shape_env_from_inputs(example_inputs)
+    fake_mode = compile_fx.fake_mode_from_tensors(example_inputs)
+    graph = compile_fx.GraphLowering(
+        gm,
+        shape_env=shape_env,
+        num_static_inputs=num_fixed,
+        graph_id=graph_id,
+        fake_mode=fake_mode,
+    )
+    with V.set_graph_handler(graph):
+        graph.run(*example_inputs)
+        code = graph.codegen()
+        fake_callable.set_code(code)
+
+    return fake_callable
+
+
+def assert_tuple_of_tensors(tensors):
+    assert isinstance(tensors, Tuple)
+    for tensor in tensors:
+        assert isinstance(tensor, torch.Tensor), 'Each element in tensors must be a torch.Tensor'
+
+
+from torch._subclasses import FakeTensor, FakeTensorMode
+
+
+def extract_inductor_code(kernel, example_inputs):
+    # Check the kernel inputs and outputs: every input must be a Tensor, and the output must be a tuple of Tensors.
+    # TODO: remove these constraints (long term)
+    assert isinstance(example_inputs, (List, Tuple))
+    example_inputs = tuple(example_inputs)
+    assert_tuple_of_tensors(example_inputs)
+    fake_mode = FakeTensorMode()
+    fake_example_inputs = [FakeTensor.from_tensor(t, fake_mode=fake_mode) for t in example_inputs]
+    fake_output = kernel(*fake_example_inputs)
+    assert_tuple_of_tensors(fake_output)
+
+    model = fx.symbolic_trace(kernel)
+    compile_fx.compile_fx(model, example_inputs_=fake_example_inputs, inner_compile=compile_fx_inner_cpu)
+
+    code = fake_callable.code
+
+    # By default, PyTorch compiles a Python module in which all the C++ kernels share the unified name `kernel`.
+    # The actual kernel name should be kernel.__name__.
+    # TODO: fix this after rewriting the inductor codegen to generate all C++ instead of a Python module
+    kernel_name = kernel.__name__
+
+    # fake_output is returned as well; the caller passes it to matx_cpp_code_format for codegen
+    return code, kernel_name, fake_output
diff --git a/python/matx/torch_compiler/codegen/matx_formatter.py b/python/matx/torch_compiler/codegen/matx_formatter.py
index 36cce17f..0e740fc9 100644
--- a/python/matx/torch_compiler/codegen/matx_formatter.py
+++ b/python/matx/torch_compiler/codegen/matx_formatter.py
@@ -35,6 +35,7 @@
 
 MATX_INCLUDE = '''
 #include "matxscript/runtime/codegen_all_includes.h"
+#include "matxscript/runtime/container/ndarray_helper.h"
 #include <math.h>
 
 using namespace ::matxscript::runtime;
@@ -250,7 +251,7 @@ def generate_kernel_wrapper_body(kernel_declaration: cpp_parse.CPPDeclaration,
     delimiter = ',\n' + ' ' * 10
     kernel_invoke_param_str = delimiter.join(kernel_invoke_param)
     kernel_invoke_str = kernel_declaration.func_name + '(' + '\n' + ' ' * num_space + \
-                        kernel_invoke_param_str + '\n' + ')' + '\n'
+                        kernel_invoke_param_str + '\n' + ');' + '\n'
 
     # step 3: return output as a Tuple
     return_str = generate_kernel_wrapper_return(fake_output)
@@ -269,7 +270,7 @@ def matx_cpp_code_format(code: str, kernel_name: str,
 
     include_code_str, kernel_code_str = split_include_kernel(code)
     # add matx include
-    include_code_str += MATX_INCLUDE
+    include_code_str = MATX_INCLUDE
 
     # extract kernel declaration
     kernel_declaration_str, kernel_body_str = split_declaration_body(kernel_code_str)
@@ -278,7 +279,7 @@ def matx_cpp_code_format(code: str, kernel_name: str,
     kernel_return_type = kernel_declaration.return_type.name
     assert kernel_return_type == 'void', f'The kernel return type must be void, Got {kernel_return_type}'
 
-    kernel_declaration.func_name += MAGIC_NUMBER
+    kernel_declaration.func_name += MAGIC_NUMBER  # TODO: currently, we simply add magic number to avoid conflict
     kernel_code_str = str(kernel_declaration) + kernel_body_str
 
     # here, we keep the original kernel and add a wrapper
@@ -291,14 +292,14 @@ def matx_cpp_code_format(code: str, kernel_name: str,
     kernel_wrapper_declaration_with_default.append_arg(SESSION_HANLDER_WITH_DEAFULT)
 
     # create all the declarations strings
-    function_declaration = [CREATE_NDARRAY_DECLARATION, str(kernel_wrapper_declaration_with_default),
+    function_declaration = [CREATE_NDARRAY_DECLARATION, str(kernel_wrapper_declaration_with_default) + ';',
                             str(kernel_declaration) + ';', get_c_api_declare(kernel_wrapper_declaration.func_name)]
 
     function_declaration_str = '\n\n'.join(function_declaration) + '\n'
 
     # create all the kernel implementation strings including
     # 1. create ndarray. 2. kernel wrapper, 3. kernel, 4. kernel-c-api
-    kernel_wrapper = str(kernel_wrapper_declaration) + kernel_wrapper_body
+    kernel_wrapper = str(kernel_wrapper_declaration_without_default) + kernel_wrapper_body
     kernel_c_api_impl_str = get_c_api(kernel_name=kernel_wrapper_declaration.func_name,
                                       args=kernel_wrapper_declaration.args,
                                       has_return_value=kernel_wrapper_declaration.return_type.name != 'void')
@@ -311,7 +312,7 @@ def matx_cpp_code_format(code: str, kernel_name: str,
     kernel_code_str = '\n\n'.join(kernel_code_str)
 
     # registration str
-    registration_code_str = get_registration_str(kernel_name=kernel_declaration.func_name)
+    registration_code_str = get_registration_str(kernel_name=kernel_wrapper_declaration.func_name)
 
     # final code
     final_code = [include_code_str, kernel_code_str, registration_code_str]
diff --git a/python/matx/torch_compiler/tests/nested_inputs.py b/python/matx/torch_compiler/tests/nested_inputs.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/python/matx/torch_compiler/tests/simple_inductor.py b/python/matx/torch_compiler/tests/simple_inductor.py
index a48ed2cf..1e7b7255 100644
--- a/python/matx/torch_compiler/tests/simple_inductor.py
+++ b/python/matx/torch_compiler/tests/simple_inductor.py
@@ -1,10 +1,13 @@
 import json
 
+import numpy as np
+
 import matx
 import torch
 
 
-@matx.inductor(example_inputs=[torch.randn(5), torch.randn(5)])
+@matx.inductor(example_inputs=[torch.from_numpy(np.random.randn(5).astype(np.int32)),
+                               torch.from_numpy(np.random.randn(5).astype(np.int32))])
 def add_relu(a: matx.NDArray, b: matx.NDArray):
     c = a + b
     c = torch.nn.functional.relu(c)
@@ -19,11 +22,10 @@ def add_json(a: str, b: str) -> str:
     a_list = json.loads(a)
     b_list = json.loads(b)
 
-    a_tensor = matx.NDArray(arr=a_list, shape=[5], dtype='float32')
-    b_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32')
-    c_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='float32')
+    a_tensor = matx.NDArray(arr=a_list, shape=[5], dtype='int32')
+    b_tensor = matx.NDArray(arr=b_list, shape=[5], dtype='int32')
 
-    add_relu(a_tensor, b_tensor, c_tensor)
+    c_tensor = add_relu(a_tensor, b_tensor)[0]
 
     result_lst = c_tensor.tolist()
 
@@ -31,6 +33,11 @@ def add_json(a: str, b: str) -> str:
 
 
 if __name__ == '__main__':
+    a_tensor = matx.NDArray(arr=[1, 2, 3, 4, 5], shape=[5], dtype='int32')
+    b_tensor = matx.NDArray(arr=[6, 7, 8, 8, 10], shape=[5], dtype='int32')
+    c_tensor = add_relu(a_tensor, b_tensor)
+    print(c_tensor)
+
     print(f'Pytorch version {torch.__version__}')
     a = json.dumps([1, 2, 3, 4, 5])
     b = json.dumps([6, 7, 8, 9, 10])
diff --git a/python/matx/torch_compiler/tests/tuple_output.py b/python/matx/torch_compiler/tests/tuple_output.py
new file mode 100644
index 00000000..3e80cba4
--- /dev/null
+++ b/python/matx/torch_compiler/tests/tuple_output.py
@@ -0,0 +1,8 @@
+import matx
+
+from typing import Tuple
+
+
+@matx.script
+def func(a: int, b: int) -> Tuple[int, int]:
+    return a, b

From df1f9bcb74e552e61e92b1ee7efe837f2a25516a Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Wed, 1 Feb 2023 17:03:08 +0800
Subject: [PATCH 09/21] add license, add basic tests

---
 python/matx/__init__.py                       |  3 +-
 python/matx/inductor/__init__.py              | 19 +++++
 python/matx/torch_compiler/__init__.py        | 19 +++++
 .../matx/torch_compiler/codegen/__init__.py   | 19 +++++
 .../codegen/inductor/__init__.py              | 19 +++++
 .../torch_compiler/codegen/utils/__init__.py  | 18 +++++
 .../torch_compiler/codegen/utils/cpp_parse.py | 19 +++++
 .../matx/torch_compiler/tests/tuple_output.py |  8 ---
 test/inductor/test_basic.py                   | 69 +++++++++++++++++++
 9 files changed, 184 insertions(+), 9 deletions(-)
 delete mode 100644 python/matx/torch_compiler/tests/tuple_output.py

diff --git a/python/matx/__init__.py b/python/matx/__init__.py
index 78cb0d35..4bab2723 100644
--- a/python/matx/__init__.py
+++ b/python/matx/__init__.py
@@ -351,7 +351,7 @@ def script(compiling_obj, *args, backend=None, **kwargs):
         return toolchain.script(compiling_obj, *args, **kwargs)
 
 
-def inductor(example_inputs, **kwargs):
+def inductor_script(example_inputs, **kwargs):
     """
 
     Args:
@@ -361,6 +361,7 @@ def inductor(example_inputs, **kwargs):
     Returns: a wrapper that compiles the compiling_obj into a JIT FUNCTION
 
     """
+
     def inner_inductor(compiling_obj):
         return toolchain.inductor(compiling_obj, example_inputs, **kwargs)
 
diff --git a/python/matx/inductor/__init__.py b/python/matx/inductor/__init__.py
index e19f2c57..c09dd4a5 100644
--- a/python/matx/inductor/__init__.py
+++ b/python/matx/inductor/__init__.py
@@ -1,3 +1,22 @@
+# Copyright 2022 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import inspect
 from typing import List
 
diff --git a/python/matx/torch_compiler/__init__.py b/python/matx/torch_compiler/__init__.py
index e83ac962..2fe03049 100644
--- a/python/matx/torch_compiler/__init__.py
+++ b/python/matx/torch_compiler/__init__.py
@@ -1,3 +1,22 @@
+# Copyright 2022 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 minimum_torch_version = '2.0.0a0'
 
 try:
diff --git a/python/matx/torch_compiler/codegen/__init__.py b/python/matx/torch_compiler/codegen/__init__.py
index 6935d78a..1dd70d50 100644
--- a/python/matx/torch_compiler/codegen/__init__.py
+++ b/python/matx/torch_compiler/codegen/__init__.py
@@ -1,2 +1,21 @@
+# Copyright 2022 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 from .inductor import extract_inductor_code
 from .matx_formatter import matx_cpp_code_format
\ No newline at end of file
diff --git a/python/matx/torch_compiler/codegen/inductor/__init__.py b/python/matx/torch_compiler/codegen/inductor/__init__.py
index 837d4448..95d543b2 100644
--- a/python/matx/torch_compiler/codegen/inductor/__init__.py
+++ b/python/matx/torch_compiler/codegen/inductor/__init__.py
@@ -1,3 +1,22 @@
+# Copyright 2022 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 from typing import List, Tuple
 
 import torch
diff --git a/python/matx/torch_compiler/codegen/utils/__init__.py b/python/matx/torch_compiler/codegen/utils/__init__.py
index e69de29b..84bf20b4 100644
--- a/python/matx/torch_compiler/codegen/utils/__init__.py
+++ b/python/matx/torch_compiler/codegen/utils/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2022 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
\ No newline at end of file
diff --git a/python/matx/torch_compiler/codegen/utils/cpp_parse.py b/python/matx/torch_compiler/codegen/utils/cpp_parse.py
index e0d9611d..499c0b9f 100644
--- a/python/matx/torch_compiler/codegen/utils/cpp_parse.py
+++ b/python/matx/torch_compiler/codegen/utils/cpp_parse.py
@@ -1,3 +1,22 @@
+# Copyright 2022 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import dataclasses
 from typing import List, Union
 
diff --git a/python/matx/torch_compiler/tests/tuple_output.py b/python/matx/torch_compiler/tests/tuple_output.py
deleted file mode 100644
index 3e80cba4..00000000
--- a/python/matx/torch_compiler/tests/tuple_output.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import matx
-
-from typing import Tuple
-
-
-@matx.script
-def func(a: int, b: int) -> Tuple[int, int]:
-    return a, b
diff --git a/test/inductor/test_basic.py b/test/inductor/test_basic.py
index e69de29b..e60c0a29 100644
--- a/test/inductor/test_basic.py
+++ b/test/inductor/test_basic.py
@@ -0,0 +1,69 @@
+# Copyright 2022 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+import matx
+import torch
+import numpy as np
+
+
+class BasicTests(unittest.TestCase):
+    def test_basics(self):
+        from matx import toolchain
+        toolchain.USE_SO_CACHE = False
+
+        def add_relu(a, b):
+            c = a + b
+            c = torch.nn.functional.relu(c)
+            return c,
+
+        sizes = [(5,), (10,), (2, 3), (4, 5, 6)]
+        dtypes = [np.float32, np.float64, np.int32, np.int64]
+
+        for size in sizes:
+            for dtype in dtypes:
+                a_numpy = np.random.randn(*size).astype(dtype)
+                b_numpy = np.random.randn(*size).astype(dtype)
+
+                example_inputs = [torch.from_numpy(np.random.randn(*size).astype(dtype)),
+                                  torch.from_numpy(np.random.randn(*size).astype(dtype))]
+
+                add_relu_kernel = matx.inductor_script(example_inputs)(add_relu)
+
+                a_tensor = torch.from_numpy(a_numpy)
+                b_tensor = torch.from_numpy(b_numpy)
+
+                a_ndarray = matx.NDArray([], a_numpy.shape, str(a_numpy.dtype))
+                a_ndarray.from_numpy(a_numpy)
+                b_ndarray = matx.NDArray([], b_numpy.shape, str(b_numpy.dtype))
+                b_ndarray.from_numpy(b_numpy)
+
+                c_tensor_expected = add_relu(a_tensor, b_tensor)[0]
+                c_ndarray: matx.NDArray = add_relu_kernel(a_ndarray, b_ndarray)[0]
+                c_tensor = c_ndarray.torch()
+
+                # TODO: there seems to be a strange cache behavior in JITOp; without the following line,
+                # it fails.
+                del add_relu_kernel
+
+                torch.testing.assert_close(c_tensor_expected, c_tensor)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 6aebd22806e335c3870016d3b2adb29373e8ba2e Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Fri, 3 Feb 2023 16:48:49 +0800
Subject: [PATCH 10/21] update codegen to match new NDArray impl

---
 python/matx/pipeline/_register_conveter.py    | 15 +++
 python/matx/toolchain.py                      | 30 +++---
 .../matx/torch_compiler/codegen/__init__.py   |  2 +-
 .../codegen/inductor/__init__.py              |  5 +-
 .../torch_compiler/codegen/matx_formatter.py  | 97 ++++++++++++-------
 .../torch_compiler/codegen/utils/__init__.py  |  2 +-
 .../torch_compiler/tests/simple_inductor.py   |  7 +-
 test/inductor/test_basic.py                   | 16 ++-
 8 files changed, 111 insertions(+), 63 deletions(-)

diff --git a/python/matx/pipeline/_register_conveter.py b/python/matx/pipeline/_register_conveter.py
index 10df789f..c28e7246 100644
--- a/python/matx/pipeline/_register_conveter.py
+++ b/python/matx/pipeline/_register_conveter.py
@@ -17,6 +17,17 @@
 # specific language governing permissions and limitations
 # under the License.
 
+
+try:
+    # TODO: consider importing this lazily, only after users have called matx.inductor_script
+    import torch
+    import torch.utils.dlpack
+
+    HAS_TORCH = True
+except:
+    HAS_TORCH = False
+
+import matx
 from .._ffi._selector import _set_fast_pipeline_object_converter
 from .._ffi._selector import _set_class_symbol
 from .symbol import BaseSymbol
@@ -29,9 +40,13 @@ def _pipeline_object_converter(value):
         return value.native_op
     if isinstance(value, OpKernel):
         return value.native_op
+    if HAS_TORCH and isinstance(value, torch.Tensor):
+        return matx.array.from_dlpack(torch.utils.dlpack.to_dlpack(value))
     return value
 
 
 _PipelineClasses = (JitObject, OpKernel,)
+if HAS_TORCH:
+    _PipelineClasses += (torch.Tensor,)
 _set_fast_pipeline_object_converter(_PipelineClasses, _pipeline_object_converter)
 _set_class_symbol(BaseSymbol)
diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py
index 557bc278..9018fbce 100644
--- a/python/matx/toolchain.py
+++ b/python/matx/toolchain.py
@@ -389,24 +389,24 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl
     if DISABLE_SCRIPT:
         return compiling_obj
 
-    from matx.inductor import from_source
+    from .inductor import from_source
 
     result: context.ScriptContext = from_source(compiling_obj, example_inputs)
 
-    # TODO: get Pytorch additional compiler flags. Hardcode here for mvp
-    torch_compiler_options = [
-        '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include',
-        '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/torch/csrc/api/include',
-        '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/TH',
-        '-I/Users/bytedance/miniforge3/envs/inductor/lib/python3.10/site-packages/torch/include/THC',
-        '-I/Users/bytedance/miniforge3/envs/inductor/include/python3.10',
-        # '-lgomp',
-        # '-march=native',
-        '-ffast-math',
-        '-fno-finite-math-only',
-        # '-fopenmp',
-        '-DC10_USING_CUSTOM_GENERATED_MACROS'
-    ]
+    from torch._inductor import codecache
+    ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths(include_pytorch=False)
+
+    # TODO: check whether the following flags are handled by common flags
+    # codecache.get_shared()
+    # codecache.optimization_flags()
+    # codecache.cpp_flags()
+    # codecache.get_warning_all_flag()
+    # codecache.use_custom_generated_macros()
+
+    torch_compiler_options = ipaths.split() + lpaths.split() + libs.split() + macros.split()
+
+    # TODO: fix this on macOS m1.
+    torch_compiler_options.remove('-lgomp')
 
     build_dso(result, toolchain is not None, compile_options=torch_compiler_options)
     if toolchain is not None:
diff --git a/python/matx/torch_compiler/codegen/__init__.py b/python/matx/torch_compiler/codegen/__init__.py
index 1dd70d50..9ad89473 100644
--- a/python/matx/torch_compiler/codegen/__init__.py
+++ b/python/matx/torch_compiler/codegen/__init__.py
@@ -18,4 +18,4 @@
 # under the License.
 
 from .inductor import extract_inductor_code
-from .matx_formatter import matx_cpp_code_format
\ No newline at end of file
+from .matx_formatter import matx_cpp_code_format
diff --git a/python/matx/torch_compiler/codegen/inductor/__init__.py b/python/matx/torch_compiler/codegen/inductor/__init__.py
index 95d543b2..a85e6c16 100644
--- a/python/matx/torch_compiler/codegen/inductor/__init__.py
+++ b/python/matx/torch_compiler/codegen/inductor/__init__.py
@@ -96,7 +96,10 @@ def extract_inductor_code(kernel, example_inputs):
     assert_tuple_of_tensors(fake_output)
 
     model = fx.symbolic_trace(kernel)
-    compile_fx.compile_fx(model, example_inputs_=fake_example_inputs, inner_compile=compile_fx_inner_cpu)
+    compile_fx.compile_fx(
+        model,
+        example_inputs_=fake_example_inputs,
+        inner_compile=compile_fx_inner_cpu)
 
     code = fake_callable.code
 
diff --git a/python/matx/torch_compiler/codegen/matx_formatter.py b/python/matx/torch_compiler/codegen/matx_formatter.py
index 0e740fc9..f583c705 100644
--- a/python/matx/torch_compiler/codegen/matx_formatter.py
+++ b/python/matx/torch_compiler/codegen/matx_formatter.py
@@ -35,7 +35,6 @@
 
 MATX_INCLUDE = '''
 #include "matxscript/runtime/codegen_all_includes.h"
-#include "matxscript/runtime/container/ndarray_helper.h"
 #include <math.h>
 
 using namespace ::matxscript::runtime;
@@ -49,25 +48,19 @@
 
 SESSION_HANLDER = cpp_parse.CPPArg(name=f'handle_{MAGIC_NUMBER}',
                                    type=cpp_parse.CPPType(name='void', is_pointer=True))
-SESSION_HANLDER_WITH_DEAFULT = cpp_parse.CPPArg(name=f'handle_{MAGIC_NUMBER}',
-                                                type=cpp_parse.CPPType(name='void', is_pointer=True),
-                                                default_val='((void*)(int64_t)0)')
-
-CREATE_NDARRAY_DECLARATION = '''
-// helper function to create NDArray
-NDArray createNDArray(const std::string& dtype,
-                      const std::string& device,
-                      const std::vector<int64_t>& arg_shape);
-'''
+SESSION_HANLDER_WITH_DEAFULT = cpp_parse.CPPArg(
+    name=f'handle_{MAGIC_NUMBER}', type=cpp_parse.CPPType(
+        name='void', is_pointer=True), default_val='((void*)(int64_t)0)')
 
 CREATE_NDARRAY_IMPLEMENTATION = '''
-NDArray createNDArray(const std::string& dtype,
-                      const std::string& device,
-                      const std::vector<int64_t>& arg_shape) {
+NDArray createNDArray(const std::string& dtype, const std::string& device, const List& arg_shape) {
   Unicode dtype_str(UTF8Decode(dtype));
   Unicode ctx_str(UTF8Decode(device));
-  DataType data_type(String2DLDataType(UTF8Encode(dtype_str.view())));
-  return NDArray::Empty(arg_shape, data_type, NDArrayHelper::GetDevice(ctx_str));
+
+  auto a = Kernel_NDArray::make(0., arg_shape, dtype_str, ctx_str);
+  // set impl to torch.Tensor
+  a.SetImpl(NDArray::Impl::torchTensor);
+  return a;
 }
 '''
 
@@ -88,13 +81,13 @@ def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value)
     RTView pos_args[{}];
     helper.unpack(pos_args, args, num_args);  // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5
 
-    auto ret = {}({}, 
+    auto ret = {}({},
                 {}resource_handle);
     RTValue(std::move(ret)).MoveToCHost(out_ret_value);
   }} else {{
     switch(num_args) {{
       case {}: {{
-        auto ret = {}({}, 
+        auto ret = {}({},
                     {}resource_handle);  // /Users/bytedance/Developer/open_source_library/matxscript/examples/simple_function.py:5
         RTValue(std::move(ret)).MoveToCHost(out_ret_value);
       }} break;
@@ -128,9 +121,20 @@ def get_c_api(kernel_name: str, args: List[cpp_parse.CPPArg], has_return_value)
     pos_arg_cast = (',' + pos_arg_cast_indentation).join(pos_arg_cast_lst)
     args_t_cast = (',' + args_t_cast_indentation).join(args_t_cast_lst)
 
-    return template.format(kernel_name, num_args, arg_names_concat_str, kernel_name, num_args, num_args, kernel_name,
-                           pos_arg_cast, kernel_name_indentation, num_args, kernel_name,
-                           args_t_cast, kernel_name_indentation)
+    return template.format(
+        kernel_name,
+        num_args,
+        arg_names_concat_str,
+        kernel_name,
+        num_args,
+        num_args,
+        kernel_name,
+        pos_arg_cast,
+        kernel_name_indentation,
+        num_args,
+        kernel_name,
+        args_t_cast,
+        kernel_name_indentation)
 
 
 def get_registration_str(kernel_name):
@@ -183,8 +187,13 @@ def generate_kernel_wrapper_declaration(kernel_name, example_inputs):
     return_type = cpp_parse.CPPType(name='Tuple', is_pointer=False)
     args = []
     for i in range(len(example_inputs)):
-        arg = cpp_parse.CPPArg(name=f'in_ptr{i}', type=cpp_parse.CPPType(name='NDArray', is_pointer=False),
-                               is_const=False, is_restricted=False)
+        arg = cpp_parse.CPPArg(
+            name=f'in_ptr{i}',
+            type=cpp_parse.CPPType(
+                name='NDArray',
+                is_pointer=False),
+            is_const=False,
+            is_restricted=False)
         args.append(arg)
     kernel_wrapper_declaration = cpp_parse.CPPDeclaration(func_name=kernel_name,
                                                           return_type=return_type,
@@ -193,7 +202,11 @@ def generate_kernel_wrapper_declaration(kernel_name, example_inputs):
     return kernel_wrapper_declaration
 
 
-def generate_ndarray_allocate_statement(output_name: str, dtype: str, device: str, shape: List[int]):
+def generate_ndarray_allocate_statement(
+        output_name: str,
+        dtype: str,
+        device: str,
+        shape: List[int]):
     assert dtype in ['int32', 'int64', 'float32', 'float64']
     assert device == 'cpu'
     assert isinstance(shape, List)
@@ -251,7 +264,7 @@ def generate_kernel_wrapper_body(kernel_declaration: cpp_parse.CPPDeclaration,
     delimiter = ',\n' + ' ' * 10
     kernel_invoke_param_str = delimiter.join(kernel_invoke_param)
     kernel_invoke_str = kernel_declaration.func_name + '(' + '\n' + ' ' * num_space + \
-                        kernel_invoke_param_str + '\n' + ');' + '\n'
+        kernel_invoke_param_str + '\n' + ');' + '\n'
 
     # step 3: return output as a Tuple
     return_str = generate_kernel_wrapper_return(fake_output)
@@ -279,7 +292,8 @@ def matx_cpp_code_format(code: str, kernel_name: str,
     kernel_return_type = kernel_declaration.return_type.name
     assert kernel_return_type == 'void', f'The kernel return type must be void, Got {kernel_return_type}'
 
-    kernel_declaration.func_name += MAGIC_NUMBER  # TODO: currently, we simply add magic number to avoid conflict
+    # TODO: currently, we simply add magic number to avoid conflict
+    kernel_declaration.func_name += MAGIC_NUMBER
     kernel_code_str = str(kernel_declaration) + kernel_body_str
 
     # here, we keep the original kernel and add a wrapper
@@ -292,23 +306,38 @@ def matx_cpp_code_format(code: str, kernel_name: str,
     kernel_wrapper_declaration_with_default.append_arg(SESSION_HANLDER_WITH_DEAFULT)
 
     # create all the declarations strings
-    function_declaration = [CREATE_NDARRAY_DECLARATION, str(kernel_wrapper_declaration_with_default) + ';',
-                            str(kernel_declaration) + ';', get_c_api_declare(kernel_wrapper_declaration.func_name)]
+    CREATE_NDARRAY_DECLARATION = split_declaration_body(CREATE_NDARRAY_IMPLEMENTATION)[0] + ';'
+
+    function_declaration = [
+        CREATE_NDARRAY_DECLARATION,
+        str(kernel_wrapper_declaration_with_default) + ';',
+        str(kernel_declaration) + ';',
+        get_c_api_declare(
+            kernel_wrapper_declaration.func_name)]
 
     function_declaration_str = '\n\n'.join(function_declaration) + '\n'
 
     # create all the kernel implementation strings including
     # 1. create ndarray. 2. kernel wrapper, 3. kernel, 4. kernel-c-api
     kernel_wrapper = str(kernel_wrapper_declaration_without_default) + kernel_wrapper_body
-    kernel_c_api_impl_str = get_c_api(kernel_name=kernel_wrapper_declaration.func_name,
-                                      args=kernel_wrapper_declaration.args,
-                                      has_return_value=kernel_wrapper_declaration.return_type.name != 'void')
-
-    implementations = [CREATE_NDARRAY_IMPLEMENTATION, kernel_wrapper, kernel_code_str, kernel_c_api_impl_str]
+    kernel_c_api_impl_str = get_c_api(
+        kernel_name=kernel_wrapper_declaration.func_name,
+        args=kernel_wrapper_declaration.args,
+        has_return_value=kernel_wrapper_declaration.return_type.name != 'void')
+
+    implementations = [
+        CREATE_NDARRAY_IMPLEMENTATION,
+        kernel_wrapper,
+        kernel_code_str,
+        kernel_c_api_impl_str]
     implementations_str = '\n\n'.join(implementations) + '\n'
 
     # add namespace
-    kernel_code_str = ['namespace {', function_declaration_str, implementations_str, '} // namespace']
+    kernel_code_str = [
+        'namespace {',
+        function_declaration_str,
+        implementations_str,
+        '} // namespace']
     kernel_code_str = '\n\n'.join(kernel_code_str)
 
     # registration str
diff --git a/python/matx/torch_compiler/codegen/utils/__init__.py b/python/matx/torch_compiler/codegen/utils/__init__.py
index 84bf20b4..9e19ab85 100644
--- a/python/matx/torch_compiler/codegen/utils/__init__.py
+++ b/python/matx/torch_compiler/codegen/utils/__init__.py
@@ -15,4 +15,4 @@
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
-# under the License.
\ No newline at end of file
+# under the License.
diff --git a/python/matx/torch_compiler/tests/simple_inductor.py b/python/matx/torch_compiler/tests/simple_inductor.py
index 1e7b7255..f611743f 100644
--- a/python/matx/torch_compiler/tests/simple_inductor.py
+++ b/python/matx/torch_compiler/tests/simple_inductor.py
@@ -6,8 +6,8 @@
 import torch
 
 
-@matx.inductor(example_inputs=[torch.from_numpy(np.random.randn(5).astype(np.int32)),
-                               torch.from_numpy(np.random.randn(5).astype(np.int32))])
+@matx.inductor_script(example_inputs=[torch.from_numpy(np.random.randn(5).astype(np.int32)),
+                                      torch.from_numpy(np.random.randn(5).astype(np.int32))])
 def add_relu(a: matx.NDArray, b: matx.NDArray):
     c = a + b
     c = torch.nn.functional.relu(c)
@@ -35,6 +35,9 @@ def add_json(a: str, b: str) -> str:
 if __name__ == '__main__':
     a_tensor = matx.NDArray(arr=[1, 2, 3, 4, 5], shape=[5], dtype='int32')
     b_tensor = matx.NDArray(arr=[6, 7, 8, 8, 10], shape=[5], dtype='int32')
+
+    a_tensor = a_tensor.torch(copy=True)
+
     c_tensor = add_relu(a_tensor, b_tensor)
     print(c_tensor)
 
diff --git a/test/inductor/test_basic.py b/test/inductor/test_basic.py
index e60c0a29..5373a6af 100644
--- a/test/inductor/test_basic.py
+++ b/test/inductor/test_basic.py
@@ -24,7 +24,9 @@
 
 
 class BasicTests(unittest.TestCase):
+
     def test_basics(self):
+        # TODO: fix cache_hit issues.
         from matx import toolchain
         toolchain.USE_SO_CACHE = False
 
@@ -49,21 +51,17 @@ def add_relu(a, b):
                 a_tensor = torch.from_numpy(a_numpy)
                 b_tensor = torch.from_numpy(b_numpy)
 
-                a_ndarray = matx.NDArray([], a_numpy.shape, str(a_numpy.dtype))
-                a_ndarray.from_numpy(a_numpy)
-                b_ndarray = matx.NDArray([], b_numpy.shape, str(b_numpy.dtype))
-                b_ndarray.from_numpy(b_numpy)
-
                 c_tensor_expected = add_relu(a_tensor, b_tensor)[0]
-                c_ndarray: matx.NDArray = add_relu_kernel(a_ndarray, b_ndarray)[0]
-                c_tensor = c_ndarray.torch()
+                c_tensor = add_relu_kernel(a_tensor, b_tensor)[0]
 
-                # TODO: there seems a strange cache behavior of JITOp, without the following line,
-                # it fails.
+                # TODO: there seems a strange cache behavior of JITOp, without the
+                # following line, it fails.
                 del add_relu_kernel
 
                 torch.testing.assert_close(c_tensor_expected, c_tensor)
 
+        toolchain.USE_SO_CACHE = True
+
 
 if __name__ == '__main__':
     unittest.main()

From e6459c467621c01abfa50b4329976e601a74281d Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Mon, 6 Feb 2023 12:30:17 +0800
Subject: [PATCH 11/21] fix matx.inductor cache_hit

---
 python/matx/__init__.py                       |  2 +-
 python/matx/inductor/context/__init__.py      |  0
 .../script/analysis/build_type_analysis.py    |  2 +-
 .../matx/script/context/inductor_context.py   |  4 +-
 python/matx/{ => script}/inductor/__init__.py | 66 ++++++++++---------
 python/matx/script/inductor/tensor_spec.py    | 30 +++++++++
 python/matx/toolchain.py                      | 32 +++++++--
 test/inductor/test_basic.py                   | 13 +---
 8 files changed, 100 insertions(+), 49 deletions(-)
 delete mode 100644 python/matx/inductor/context/__init__.py
 rename python/matx/{ => script}/inductor/__init__.py (52%)
 create mode 100644 python/matx/script/inductor/tensor_spec.py

diff --git a/python/matx/__init__.py b/python/matx/__init__.py
index 034c4a72..697c6c19 100644
--- a/python/matx/__init__.py
+++ b/python/matx/__init__.py
@@ -351,7 +351,7 @@ def script(compiling_obj, *args, backend=None, **kwargs):
         return toolchain.script(compiling_obj, *args, **kwargs)
 
 
-def inductor_script(example_inputs, **kwargs):
+def inductor(example_inputs, **kwargs):
     """
 
     Args:
diff --git a/python/matx/inductor/context/__init__.py b/python/matx/inductor/context/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/python/matx/script/analysis/build_type_analysis.py b/python/matx/script/analysis/build_type_analysis.py
index 6cfd0257..7d43cb84 100644
--- a/python/matx/script/analysis/build_type_analysis.py
+++ b/python/matx/script/analysis/build_type_analysis.py
@@ -30,7 +30,7 @@ def run(self, sc_ctx: context.ScriptContext):
         node_ctx = sc_ctx.main_node.context
         if isinstance(node_ctx, context.ClassContext):
             build_type = context.BuildType.JIT_OBJECT
-        elif isinstance(node_ctx, context.FunctionContext):
+        elif isinstance(node_ctx, (context.FunctionContext, context.InductorContext)):
             build_type = context.BuildType.FUNCTION
         else:
             raise RuntimeError("Only one-function, one-class source code is allowed")
diff --git a/python/matx/script/context/inductor_context.py b/python/matx/script/context/inductor_context.py
index ec775a1f..cc3d1c99 100644
--- a/python/matx/script/context/inductor_context.py
+++ b/python/matx/script/context/inductor_context.py
@@ -20,11 +20,13 @@
 
 class InductorContext(object):
     def __init__(self,
-                 fn_name: str = '<unknown>', ):
+                 fn_name: str = '<unknown>',
+                 example_inputs_spec=None):
         self.fn_name = fn_name
         self.unbound_name = fn_name
         self.return_type = None
         self.arg_types = {}  # Deferred?
+        self.example_inputs_spec = example_inputs_spec
 
     @property
     def name(self):
diff --git a/python/matx/inductor/__init__.py b/python/matx/script/inductor/__init__.py
similarity index 52%
rename from python/matx/inductor/__init__.py
rename to python/matx/script/inductor/__init__.py
index c09dd4a5..9c75d904 100644
--- a/python/matx/inductor/__init__.py
+++ b/python/matx/script/inductor/__init__.py
@@ -22,48 +22,54 @@
 
 import torch
 
-from matx.env import MATX_DEV_MODE
-from matx.script import context
-from matx.toolchain import path_prefix
 from matx.torch_compiler.codegen import extract_inductor_code, matx_cpp_code_format
+from .tensor_spec import TensorSpec
+from .. import context, analysis
+from ... import _ffi
+from ... import ir
+from ...env import MATX_DEV_MODE
+
+
+def _embedded_inductor_ctx(compiling_obj, example_inputs):
+    code = _obtain_inductor_code(compiling_obj, example_inputs)
+    build_module = _ffi.get_global_func("embedded.build.c")
+    sc_ctx = context.ScriptContext()
+    sc_ctx.main_node.raw = compiling_obj
+    if isinstance(code, str):
+        code = code.encode()
+    sc_ctx.rt_module = build_module(code)
+    example_inputs_spec = [TensorSpec.from_tensor(inputs) for inputs in example_inputs]
+    sc_ctx.main_node.context = context.InductorContext(fn_name=compiling_obj.__name__,
+                                                       example_inputs_spec=example_inputs_spec)
+    return sc_ctx
+
+
+def _pass(sc_ctx: context.ScriptContext):
+    src_anls = analysis.SourceAnalysis()
+    src_anls.run(sc_ctx)
+
+
+def _obtain_inductor_code(compiling_obj, example_inputs):
+    # compile the kernel and set the code
+    code, kernel_name, fake_output = extract_inductor_code(compiling_obj, example_inputs)
+    code = matx_cpp_code_format(code, kernel_name, example_inputs, fake_output)
+    return code
 
 
 def from_source(compiling_obj: type, example_inputs: List[torch.Tensor]) -> context.ScriptContext:
     try:
-        # set sc_ctx attributes to be compatible with existing matx code
-        sc_ctx = context.ScriptContext()
-        sc_ctx.build_type = context.BuildType.FUNCTION
-        sc_ctx.main_node.raw = compiling_obj
-        inductor_context = context.InductorContext(fn_name=compiling_obj.__name__)
-        sc_ctx.main_node.context = inductor_context
-        # set source code TODO: formatting source code
-        sc_ctx.main_node.span.source_code = inspect.getsource(compiling_obj)
-        # set filename. TODO: this is too hack
-        frame = inspect.stack()[3]
-        sc_ctx.main_node.span.file_name = frame[0].f_code.co_filename
+        # TODO: allow generalized way to specify example_inputs
+        sc_ctx = _embedded_inductor_ctx(compiling_obj, example_inputs)
+        # set filename.
+        _pass(sc_ctx)
+        analysis.BuildTypeAnalysis().run(sc_ctx)
 
         # set args types.
-        from .. import ir
-
         # TODO: currently, we only support argument as NDArray. We may support nested inputs later
         signature = inspect.signature(compiling_obj)
         for param in signature.parameters.values():
             sc_ctx.main_node.context.arg_types[param.name] = ir.type.NDArrayType()
 
-        # compile the kernel and set the code
-        code, kernel_name, fake_output = extract_inductor_code(compiling_obj, example_inputs)
-        code = matx_cpp_code_format(code, kernel_name, example_inputs, fake_output)
-
-        # export code
-        path = path_prefix(sc_ctx)
-        with open(path, 'w') as f:
-            f.write(code)
-
-        # set rt_module
-        from .. import _ffi
-        build_module = _ffi.get_global_func("embedded.build.c")
-        sc_ctx.rt_module = build_module(code.encode())
-
         return sc_ctx
     except BaseException as e:
         if MATX_DEV_MODE:
diff --git a/python/matx/script/inductor/tensor_spec.py b/python/matx/script/inductor/tensor_spec.py
new file mode 100644
index 00000000..bdee4a73
--- /dev/null
+++ b/python/matx/script/inductor/tensor_spec.py
@@ -0,0 +1,30 @@
+def convert_torch_dtype(dtype):
+    import torch
+    table = {
+        torch.int32: 'int32',
+        torch.int64: 'int64',
+        torch.float32: 'float32',
+        torch.float64: 'float64'
+    }
+    if dtype not in table:
+        raise NotImplementedError(f'Unsupport torch.Tensor dtype {dtype}')
+
+    return table[dtype]
+
+
+class TensorSpec(object):
+    def __init__(self, shape, dtype):
+        self.shape = tuple(shape)
+        self.dtype = dtype
+
+    @classmethod
+    def from_tensor(cls, tensor):
+        import torch
+        assert isinstance(tensor, torch.Tensor)
+        return cls(shape=tuple(tensor.shape), dtype=convert_torch_dtype(tensor.dtype))
+
+    def __str__(self):
+        return str(self.shape) + ', ' + self.dtype
+
+    def __repr__(self):
+        return f'TensorSpec({str(self)})'
diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py
index 9018fbce..96519f9b 100644
--- a/python/matx/toolchain.py
+++ b/python/matx/toolchain.py
@@ -252,6 +252,26 @@ def path_prefix(sc_ctx: context.ScriptContext):
                                                              cache_md5))
 
 
+def path_prefix_inductor(sc_ctx: context.ScriptContext):
+    """inductor path_prefix encodes meta info from example_inputs"""
+    # mkdir LIB_PATH
+    from .__init__ import __version__
+    _mk_lib_dir()
+    # code + sha1(libmatx.so) + commit_id(__version__)
+    dep_source_codes = "".join(dep_node.span.source_code for dep_node in sc_ctx.deps_node)
+    assert isinstance(sc_ctx.main_node.context, context.InductorContext)
+    example_inputs = sc_ctx.main_node.context.example_inputs_spec
+    example_inputs_str = ''.join([str(inputs) for inputs in example_inputs])
+    cache_str = sc_ctx.main_node.span.source_code + dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__
+    cache_md5 = hashlib.md5(cache_str.encode()).hexdigest()[:16]
+    file_name = os.path.splitext(os.path.basename(sc_ctx.main_node.span.file_name))[0]
+    return os.path.abspath('{}/lib{}_{}_{}_plugin_{}'.format(LIB_PATH,
+                                                             file_name,
+                                                             sc_ctx.main_node.span.lineno,
+                                                             sc_ctx.main_node.context.name,
+                                                             cache_md5))
+
+
 def toolchain_path_prefix(sc_ctx: context.ScriptContext, toolchain_str: str):
     from .__init__ import __version__
     # mkdir LIB_PATH
@@ -297,10 +317,13 @@ def toolchain_build(sc_ctx: context.ScriptContext, toolchain: ToolChain):
         sc_ctx.dso_path = (sc_ctx.dso_path[0], so_path)
 
 
-def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False, compile_options=None):
+def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False, compile_options=None, make_path_prefix=None):
     rt_mod = sc_ctx.rt_module
     main_node_name = sc_ctx.main_node.context.name
-    base_path = path_prefix(sc_ctx)
+    if make_path_prefix is None:
+        make_path_prefix = path_prefix
+
+    base_path = make_path_prefix(sc_ctx)
 
     with contrib.util.filelock(base_path):
         sopath = base_path + '.so'
@@ -389,7 +412,7 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl
     if DISABLE_SCRIPT:
         return compiling_obj
 
-    from .inductor import from_source
+    from .script.inductor import from_source
 
     result: context.ScriptContext = from_source(compiling_obj, example_inputs)
 
@@ -408,7 +431,8 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl
     # TODO: fix this on macOS m1.
     torch_compiler_options.remove('-lgomp')
 
-    build_dso(result, toolchain is not None, compile_options=torch_compiler_options)
+    build_dso(result, toolchain is not None, compile_options=torch_compiler_options,
+              make_path_prefix=path_prefix_inductor)
     if toolchain is not None:
         toolchain_build(result, toolchain)
 
diff --git a/test/inductor/test_basic.py b/test/inductor/test_basic.py
index 5373a6af..f28f981d 100644
--- a/test/inductor/test_basic.py
+++ b/test/inductor/test_basic.py
@@ -26,10 +26,6 @@
 class BasicTests(unittest.TestCase):
 
     def test_basics(self):
-        # TODO: fix cache_hit issues.
-        from matx import toolchain
-        toolchain.USE_SO_CACHE = False
-
         def add_relu(a, b):
             c = a + b
             c = torch.nn.functional.relu(c)
@@ -46,22 +42,15 @@ def add_relu(a, b):
                 example_inputs = [torch.from_numpy(np.random.randn(*size).astype(dtype)),
                                   torch.from_numpy(np.random.randn(*size).astype(dtype))]
 
-                add_relu_kernel = matx.inductor_script(example_inputs)(add_relu)
+                add_relu_kernel = matx.inductor(example_inputs)(add_relu)
 
                 a_tensor = torch.from_numpy(a_numpy)
                 b_tensor = torch.from_numpy(b_numpy)
 
                 c_tensor_expected = add_relu(a_tensor, b_tensor)[0]
                 c_tensor = add_relu_kernel(a_tensor, b_tensor)[0]
-
-                # TODO: there seems a strange cache behavior of JITOp, without the
-                # following line, it fails.
-                del add_relu_kernel
-
                 torch.testing.assert_close(c_tensor_expected, c_tensor)
 
-        toolchain.USE_SO_CACHE = True
-
 
 if __name__ == '__main__':
     unittest.main()

From f6bc9c5bf18b367825b3d0130a4079412599cb8a Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Mon, 6 Feb 2023 12:31:35 +0800
Subject: [PATCH 12/21] fix py codestyle

---
 python/matx/toolchain.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py
index 96519f9b..e01b2299 100644
--- a/python/matx/toolchain.py
+++ b/python/matx/toolchain.py
@@ -262,7 +262,8 @@ def path_prefix_inductor(sc_ctx: context.ScriptContext):
     assert isinstance(sc_ctx.main_node.context, context.InductorContext)
     example_inputs = sc_ctx.main_node.context.example_inputs_spec
     example_inputs_str = ''.join([str(inputs) for inputs in example_inputs])
-    cache_str = sc_ctx.main_node.span.source_code + dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__
+    cache_str = sc_ctx.main_node.span.source_code + \
+                dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__
     cache_md5 = hashlib.md5(cache_str.encode()).hexdigest()[:16]
     file_name = os.path.splitext(os.path.basename(sc_ctx.main_node.span.file_name))[0]
     return os.path.abspath('{}/lib{}_{}_{}_plugin_{}'.format(LIB_PATH,
@@ -317,7 +318,10 @@ def toolchain_build(sc_ctx: context.ScriptContext, toolchain: ToolChain):
         sc_ctx.dso_path = (sc_ctx.dso_path[0], so_path)
 
 
-def build_dso(sc_ctx: context.ScriptContext, use_toolchain=False, compile_options=None, make_path_prefix=None):
+def build_dso(sc_ctx: context.ScriptContext,
+              use_toolchain=False,
+              compile_options=None,
+              make_path_prefix=None):
     rt_mod = sc_ctx.rt_module
     main_node_name = sc_ctx.main_node.context.name
     if make_path_prefix is None:

From 54bde844ef092817cac321bceb5061cccdbd1017 Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Mon, 6 Feb 2023 12:31:51 +0800
Subject: [PATCH 13/21] fix py codestyle

---
 python/matx/toolchain.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py
index e01b2299..422a0796 100644
--- a/python/matx/toolchain.py
+++ b/python/matx/toolchain.py
@@ -263,7 +263,7 @@ def path_prefix_inductor(sc_ctx: context.ScriptContext):
     example_inputs = sc_ctx.main_node.context.example_inputs_spec
     example_inputs_str = ''.join([str(inputs) for inputs in example_inputs])
     cache_str = sc_ctx.main_node.span.source_code + \
-                dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__
+        dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__
     cache_md5 = hashlib.md5(cache_str.encode()).hexdigest()[:16]
     file_name = os.path.splitext(os.path.basename(sc_ctx.main_node.span.file_name))[0]
     return os.path.abspath('{}/lib{}_{}_{}_plugin_{}'.format(LIB_PATH,

From 54bc1c004964abd6da9beec7ca9b387df05e5c8c Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Mon, 6 Feb 2023 13:11:09 +0800
Subject: [PATCH 14/21] add inductor to __all__ in matx.__init__

---
 python/matx/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/matx/__init__.py b/python/matx/__init__.py
index 697c6c19..63dddd4a 100644
--- a/python/matx/__init__.py
+++ b/python/matx/__init__.py
@@ -40,6 +40,7 @@
     "trace",
     "script",
     "script_embedded_class",
+    "inductor",
     "save",
     "load",
     "get_cflags",

From d8141131c3e7eb48fdf32442a57feb6adcbe98fc Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Mon, 6 Feb 2023 13:13:36 +0800
Subject: [PATCH 15/21] remove TODO in
 python/matx/pipeline/_register_conveter.py

---
 python/matx/pipeline/_register_conveter.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/matx/pipeline/_register_conveter.py b/python/matx/pipeline/_register_conveter.py
index c28e7246..7fca3493 100644
--- a/python/matx/pipeline/_register_conveter.py
+++ b/python/matx/pipeline/_register_conveter.py
@@ -19,7 +19,6 @@
 
 
 try:
-    # TODO: consider lazy import this after users called matx.inductor_script
     import torch
     import torch.utils.dlpack
 

From a4c590ca80fec4e26407b4c86cb0a3a0091e5368 Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Mon, 6 Feb 2023 13:20:57 +0800
Subject: [PATCH 16/21] add LICENSE

---
 python/matx/script/inductor/tensor_spec.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/python/matx/script/inductor/tensor_spec.py b/python/matx/script/inductor/tensor_spec.py
index bdee4a73..40a229fc 100644
--- a/python/matx/script/inductor/tensor_spec.py
+++ b/python/matx/script/inductor/tensor_spec.py
@@ -1,3 +1,22 @@
+# Copyright 2022 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 def convert_torch_dtype(dtype):
     import torch
     table = {

From 47cb1097d127c962c43212da798af865ae48d831 Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Mon, 6 Feb 2023 15:46:16 +0800
Subject: [PATCH 17/21] add inductor ci

---
 .github/workflows/test_py_inductor.yml | 26 ++++++++++++++
 ci/run_py_inductor_test.sh             | 50 ++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 .github/workflows/test_py_inductor.yml
 create mode 100644 ci/run_py_inductor_test.sh

diff --git a/.github/workflows/test_py_inductor.yml b/.github/workflows/test_py_inductor.yml
new file mode 100644
index 00000000..b671d168
--- /dev/null
+++ b/.github/workflows/test_py_inductor.yml
@@ -0,0 +1,26 @@
+name: Test Inductor
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.8'
+      - name: Prepare PyTorch 2.0 nightly
+        run: pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+      - name: Echo GCC version
+        run: gcc --version
+      - name: Install MATXScript Requirements
+        run: pip3 install -r python/requirements.txt
+      - name: PyTorch Extension Test
+        run: bash ci/run_py_inductor_test.sh
diff --git a/ci/run_py_inductor_test.sh b/ci/run_py_inductor_test.sh
new file mode 100644
index 00000000..14a68c0f
--- /dev/null
+++ b/ci/run_py_inductor_test.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Copyright 2022 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -xue
+set -o pipefail
+
+THIS_PATH=$(cd $(dirname "$0"); pwd)
+ROOT_PATH=${THIS_PATH}/../
+
+###############################################################################
+# build all shared targets
+###############################################################################
+cd "${ROOT_PATH}" || exit 1
+BUILD_TESTING=OFF BUILD_BENCHMARK=OFF CPPFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" bash ci/build_lib.sh
+
+###############################################################################
+# install requirements
+###############################################################################
+PYTHON_MODULE_PATH=${ROOT_PATH}/python
+cd "${PYTHON_MODULE_PATH}"
+pip3 install -r requirements.txt
+
+###############################################################################
+# find and run all inductor test scripts
+###############################################################################
+PYTHONPATH=${PYTHONPATH:-}
+TEST_SCRIPT_PATH=${ROOT_PATH}/test/inductor
+cd "${TEST_SCRIPT_PATH}"
+# shellcheck disable=SC2045
+for script_file in $(ls test_*.py); do
+  echo "test script: ${script_file}"
+  PYTHONPATH="${ROOT_PATH}/python:${PYTHONPATH}" python3 "${script_file}"
+done

From 43de4248ad20690018695884350c6a5d3bd7d590 Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Mon, 6 Feb 2023 16:08:07 +0800
Subject: [PATCH 18/21] fix PyTorch version check

---
 .github/workflows/test_py_inductor.yml | 4 ++--
 python/matx/torch_compiler/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test_py_inductor.yml b/.github/workflows/test_py_inductor.yml
index b671d168..72d8436e 100644
--- a/.github/workflows/test_py_inductor.yml
+++ b/.github/workflows/test_py_inductor.yml
@@ -1,4 +1,4 @@
-name: Test Inductor
+name: Test Python Inductor
 
 on:
   push:
@@ -22,5 +22,5 @@ jobs:
         run: gcc --version
       - name: Install MATXScript Requirements
         run: pip3 install -r python/requirements.txt
-      - name: PyTorch Extension Test
+      - name: Python Inductor Test
         run: bash ci/run_py_inductor_test.sh
diff --git a/python/matx/torch_compiler/__init__.py b/python/matx/torch_compiler/__init__.py
index 2fe03049..97fcd247 100644
--- a/python/matx/torch_compiler/__init__.py
+++ b/python/matx/torch_compiler/__init__.py
@@ -17,7 +17,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-minimum_torch_version = '2.0.0a0'
+minimum_torch_version = '2.0.0.dev'
 
 try:
     import torch

From 032da39e5b5ff6010f4715e8c800c2dc7e199121 Mon Sep 17 00:00:00 2001
From: Chi Zhang <zhangchi.usc1992@bytedance.com>
Date: Mon, 6 Feb 2023 16:47:18 +0800
Subject: [PATCH 19/21] update torch_compiler to match nightly 20230205

---
 .github/workflows/test_py_inductor.yml        |  2 +-
 python/matx/toolchain.py                      |  7 ++--
 .../codegen/inductor/__init__.py              | 32 ++++++++++++-------
 3 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/test_py_inductor.yml b/.github/workflows/test_py_inductor.yml
index 72d8436e..adc4c20b 100644
--- a/.github/workflows/test_py_inductor.yml
+++ b/.github/workflows/test_py_inductor.yml
@@ -17,7 +17,7 @@ jobs:
         with:
           python-version: '3.8'
       - name: Prepare PyTorch 2.0 nightly
-        run: pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+        run: pip3 install --pre torch==2.0.0.dev20230205 --index-url https://download.pytorch.org/whl/nightly/cpu
       - name: Echo GCC version
         run: gcc --version
       - name: Install MATXScript Requirements
diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py
index 422a0796..7c8724fc 100644
--- a/python/matx/toolchain.py
+++ b/python/matx/toolchain.py
@@ -262,8 +262,8 @@ def path_prefix_inductor(sc_ctx: context.ScriptContext):
     assert isinstance(sc_ctx.main_node.context, context.InductorContext)
     example_inputs = sc_ctx.main_node.context.example_inputs_spec
     example_inputs_str = ''.join([str(inputs) for inputs in example_inputs])
-    cache_str = sc_ctx.main_node.span.source_code + \
-        dep_source_codes + example_inputs_str + _LIB_SHA1 + __version__
+    cache_str = sc_ctx.main_node.span.source_code + dep_source_codes
+    cache_str += example_inputs_str + _LIB_SHA1 + __version__
     cache_md5 = hashlib.md5(cache_str.encode()).hexdigest()[:16]
     file_name = os.path.splitext(os.path.basename(sc_ctx.main_node.span.file_name))[0]
     return os.path.abspath('{}/lib{}_{}_{}_plugin_{}'.format(LIB_PATH,
@@ -433,7 +433,8 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl
     torch_compiler_options = ipaths.split() + lpaths.split() + libs.split() + macros.split()
 
     # TODO: fix this on macOS m1.
-    torch_compiler_options.remove('-lgomp')
+    if '-lomp' in torch_compiler_options:
+        torch_compiler_options.remove('-lomp')
 
     build_dso(result, toolchain is not None, compile_options=torch_compiler_options,
               make_path_prefix=path_prefix_inductor)
diff --git a/python/matx/torch_compiler/codegen/inductor/__init__.py b/python/matx/torch_compiler/codegen/inductor/__init__.py
index a85e6c16..cc06f675 100644
--- a/python/matx/torch_compiler/codegen/inductor/__init__.py
+++ b/python/matx/torch_compiler/codegen/inductor/__init__.py
@@ -57,20 +57,28 @@ def compile_fx_inner_cpu(
     # lift the maximum depth of the Python interpreter stack
     # to adapt large/deep models
     compile_fx.sys.setrecursionlimit(max(compile_fx.sys.getrecursionlimit(), 2000))
+
     V.debug.fx_graph(gm, example_inputs)
+
     shape_env = compile_fx._shape_env_from_inputs(example_inputs)
-    fake_mode = compile_fx.fake_mode_from_tensors(example_inputs)
-    graph = compile_fx.GraphLowering(
-        gm,
-        shape_env=shape_env,
-        num_static_inputs=num_fixed,
-        graph_id=graph_id,
-        fake_mode=fake_mode,
-    )
-    with V.set_graph_handler(graph):
-        graph.run(*example_inputs)
-        code = graph.codegen()
-        fake_callable.set_code(code)
+    fake_mode = compile_fx.fake_mode_from_tensors(
+        example_inputs
+    ) or torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)
+
+    with V.set_fake_mode(fake_mode):
+        compile_fx.pattern_matcher.fx_passes(gm)
+        V.debug.fx_graph_transformed(gm, example_inputs)
+
+        graph = compile_fx.GraphLowering(
+            gm,
+            shape_env=shape_env,
+            num_static_inputs=num_fixed,
+            graph_id=graph_id,
+        )
+        with V.set_graph_handler(graph):
+            graph.run(*example_inputs)
+            code = graph.codegen()
+            fake_callable.set_code(code)
 
     return fake_callable
 

From f75d9501cc267c73dfc803cf57d0c9bfdd1fd10e Mon Sep 17 00:00:00 2001
From: Chi Zhang <czhangseu@gmail.com>
Date: Mon, 6 Feb 2023 23:01:37 +0800
Subject: [PATCH 20/21] fix linux compilation bug

---
 python/matx/toolchain.py                           | 14 +++++++-------
 .../matx/torch_compiler/codegen/matx_formatter.py  |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py
index 7c8724fc..42c0885e 100644
--- a/python/matx/toolchain.py
+++ b/python/matx/toolchain.py
@@ -421,20 +421,20 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl
     result: context.ScriptContext = from_source(compiling_obj, example_inputs)
 
     from torch._inductor import codecache
-    ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths(include_pytorch=False)
+    ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths(include_pytorch=False,
+                                                                           vec_isa=codecache.pick_vec_isa())
 
     # TODO: check whether the following flags are handled by common flags
     # codecache.get_shared()
-    # codecache.optimization_flags()
+    optimization_flag = codecache.optimization_flags()
     # codecache.cpp_flags()
     # codecache.get_warning_all_flag()
     # codecache.use_custom_generated_macros()
 
-    torch_compiler_options = ipaths.split() + lpaths.split() + libs.split() + macros.split()
-
-    # TODO: fix this on macOS m1.
-    if '-lomp' in torch_compiler_options:
-        torch_compiler_options.remove('-lomp')
+    torch_compiler_options = []
+    flag_str_lst = [ipaths, lpaths, libs, macros, optimization_flag]
+    for flag_str in flag_str_lst:
+        torch_compiler_options.extend(flag_str.split())
 
     build_dso(result, toolchain is not None, compile_options=torch_compiler_options,
               make_path_prefix=path_prefix_inductor)
diff --git a/python/matx/torch_compiler/codegen/matx_formatter.py b/python/matx/torch_compiler/codegen/matx_formatter.py
index f583c705..c30fb2ea 100644
--- a/python/matx/torch_compiler/codegen/matx_formatter.py
+++ b/python/matx/torch_compiler/codegen/matx_formatter.py
@@ -283,7 +283,7 @@ def matx_cpp_code_format(code: str, kernel_name: str,
 
     include_code_str, kernel_code_str = split_include_kernel(code)
     # add matx include
-    include_code_str = MATX_INCLUDE
+    include_code_str += MATX_INCLUDE
 
     # extract kernel declaration
     kernel_declaration_str, kernel_body_str = split_declaration_body(kernel_code_str)

From e5115e7ff0bbf053e08ddd1d16db0df3c6c10998 Mon Sep 17 00:00:00 2001
From: Chi Zhang <czhangseu@gmail.com>
Date: Mon, 6 Feb 2023 23:05:52 +0800
Subject: [PATCH 21/21] fix py codestyle

---
 python/matx/toolchain.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/matx/toolchain.py b/python/matx/toolchain.py
index 42c0885e..b4185c06 100644
--- a/python/matx/toolchain.py
+++ b/python/matx/toolchain.py
@@ -421,8 +421,8 @@ def inductor(compiling_obj, example_inputs, *, share=True, toolchain=None, bundl
     result: context.ScriptContext = from_source(compiling_obj, example_inputs)
 
     from torch._inductor import codecache
-    ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths(include_pytorch=False,
-                                                                           vec_isa=codecache.pick_vec_isa())
+    ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths(
+        include_pytorch=False, vec_isa=codecache.pick_vec_isa())
 
     # TODO: check whether the following flags are handled by common flags
     # codecache.get_shared()