diff --git a/padiff/abstracts/hooks/hook.py b/padiff/abstracts/hooks/hook.py index a94516c..1126dbd 100644 --- a/padiff/abstracts/hooks/hook.py +++ b/padiff/abstracts/hooks/hook.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -80,6 +80,17 @@ def init_weights_hook(model, input): if isinstance(param, (paddle.Tensor, torch.Tensor)): np_array = get_numpy_from_tensor(param) init_weights[name] = np_array + logger.debug(f"Register(init_weights_hook): '{name}'(param)") + + for name, buffer in model.named_buffers(): + if isinstance(buffer, (paddle.Tensor, torch.Tensor)) and name not in init_weights: + try: + np_array = get_numpy_from_tensor(buffer) + init_weights[name] = np_array + logger.debug(f"Register(init_weights_hook): '{name}'(buffer)") + except Exception as e: + logger.warning(f"Skip(init_weights_hook): '{name}'(unitialized buffer): {e}") + report.init_weights = init_weights report.init_weights_saved = True return None diff --git a/padiff/abstracts/marker.py b/padiff/abstracts/marker.py index 7db0eff..19d2b4e 100644 --- a/padiff/abstracts/marker.py +++ b/padiff/abstracts/marker.py @@ -107,10 +107,7 @@ def traversal_for_hook(self): def traversal_for_assign_weight(self): yield self.proxy_model for model in traversal_for_assign_weight(self.proxy_model, self): - if ( - model.model not in self.unassigned_weights_list_recursively - and len(list(model.parameters(recursively=False))) == 0 - ): + if model.model not in self.unassigned_weights_list_recursively and no_avaliable_params(model): continue yield model @@ -182,3 +179,11 @@ def traversal_for_hook(model, marker): def traversal_for_assign_weight(model, marker): yield from traversal_layers_assign_weight(model, marker) + + +def no_avaliable_params(model): + if list(model.named_parameters(recursively=False)): + return False + if list(model.named_buffers(recursively=False)): + return False + return True diff --git a/padiff/abstracts/proxy/model.py b/padiff/abstracts/proxy/model.py index 87d0318..270446e 100644 --- a/padiff/abstracts/proxy/model.py +++ b/padiff/abstracts/proxy/model.py @@ -173,8 +173,6 @@ def update_black_list_with_name(self, class_names, mode="all"): if matched_layers: self.update_black_list(matched_layers, mode) logger.info(f"update blacklist: {len(matched_layers)} added with name(s) {class_names}") - else: - logger.warning(f"update blacklist: No layers matched for {class_names}") def set_layer_map(self, layers): self.marker.set_layer_map(layers) @@ -229,6 +227,9 @@ def parameters(self, recursively): def named_parameters(self, recursively): raise NotImplementedError() + def named_buffers(self, recursively): + raise NotImplementedError() + # child sublayers, do not include self def children(self): raise NotImplementedError() @@ -271,6 +272,10 @@ def named_parameters(self, recursively=True): origin_iter = self.model.named_parameters(include_sublayers=recursively) return deco_iter(origin_iter, ProxyParam.create_from) + def named_buffers(self, recursively=True): + origin_iter = self.model.named_buffers(include_sublayers=recursively) + return deco_iter(origin_iter, ProxyParam.create_from) + def children(self): origin_iter = self.model.children() return deco_iter(origin_iter, ProxyModel.create_from) @@ -337,6 +342,10 @@ def named_parameters(self, recursively=True): origin_iter = self.model.named_parameters(recurse=recursively) return deco_iter(origin_iter, ProxyParam.create_from) + def named_buffers(self, recursively=True): + origin_iter = self.model.named_buffers(recurse=recursively) + return deco_iter(origin_iter, ProxyParam.create_from) + def children(self): origin_iter = self.model.children() return deco_iter(origin_iter, ProxyModel.create_from) diff --git a/padiff/abstracts/proxy/params.py b/padiff/abstracts/proxy/params.py index 3e357e4..89598be 100644 --- a/padiff/abstracts/proxy/params.py +++ b/padiff/abstracts/proxy/params.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ import paddle import torch +from ...utils import get_numpy_from_tensor class ProxyParam: @@ -29,8 +30,10 @@ def create_from(param): return PaddleParam(param) elif isinstance(param, torch.nn.parameter.Parameter): return TorchParam(param) + elif isinstance(param, (torch.Tensor, paddle.Tensor)): + return ProxyTensor(param) else: - raise RuntimeError(f"Can not create ProxyParam from {type(param)}") + logger.error(f"Can not create ProxyParam from {type(param)}") def numpy(self): raise NotImplementedError() @@ -52,15 +55,8 @@ class PaddleParam(ProxyParam): def __init__(self, param): super().__init__(param, "paddle") - def _numpy(self, tensor): - if tensor.dtype == paddle.bfloat16: - np_array = tensor.astype("float32").numpy() - else: - np_array = tensor.numpy() - return np_array - def numpy(self): - return self._numpy(self.param) + return get_numpy_from_tensor(self.param) def set_data(self, np_value): paddle.assign(paddle.to_tensor(np_value, dtype=self.param.dtype), self.param) @@ -70,14 +66,14 @@ def shape(self): def grad(self): if self.param.grad is not None: - return self._numpy(self.param.grad) + return get_numpy_from_tensor(self.param.grad) else: return None def main_grad(self): if hasattr(self.param, "main_grad") and self.param.main_grad is not None: assert self.param.grad is None - return self._numpy(self.param.main_grad) + return get_numpy_from_tensor(self.param.main_grad) else: return None @@ -87,15 +83,8 @@ class TorchParam(ProxyParam): def __init__(self, param): super().__init__(param, "torch") - def _numpy(self, tensor): - if tensor.dtype == torch.bfloat16: - np_array = tensor.cpu().detach().float().numpy() - else: - np_array = tensor.cpu().detach().numpy() - return np_array - def numpy(self): - return self._numpy(self.param.data) + return get_numpy_from_tensor(self.param.data) def set_data(self, np_value): self.param.data = torch.as_tensor(np_value).type(self.param.dtype).to(self.param.device) @@ -105,9 +94,23 @@ def shape(self): def grad(self): if self.param.grad is not None: - return self._numpy(self.param.grad.data) + return get_numpy_from_tensor(self.param.grad.data) else: return None def main_grad(self): return None + + +class ProxyTensor(ProxyParam): + def __init__(self, param): + super().__init__(param, "tensor") + + def numpy(self): + return get_numpy_from_tensor(self.param) + + def grad(self): + return None + + def main_grad(self): + return None diff --git a/padiff/comparison/checker/params.py b/padiff/comparison/checker/params.py index 0dad6a5..d975d7b 100644 --- a/padiff/comparison/checker/params.py +++ b/padiff/comparison/checker/params.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -54,7 +54,6 @@ def _check_param_impl(report_path_0, report_path_1, compare_target, cfg=None): for path_0, path_1 in zip(all_ranks_path_0, all_ranks_path_1): reports = [load_json(path_0, f"{compare_target}.json"), load_json(path_1, f"{compare_target}.json")] node_lists = [traversal_node(rep["tree"], []) for rep in reports] - logger.info(f"Checking {compare_target} in {path_0} and {path_1}") param_rst = param_rst and _check_params_impl(node_lists, reports, compare_target, cfg) return param_rst @@ -101,7 +100,7 @@ def _check_params_impl(node_lists, reports, compare_target, cfg): f"=========================\n" f"FAILED!!! {compare_target.capitalize()} Mismatch:\n" f" Layer: {node_0['name']}(base) vs {node_1['name']}(raw)\n" - f" Route: {node_0['route']}.{param_name_0}(base) vs {node_1['route']}.{param_name_1}(raw)\n" + f" Route.param: {node_0['route']}.{param_name_0}(base) vs {node_1['route']}.{param_name_1}(raw)\n" f"{e}\n\n" ) logger.log_file(log_name, "a", info) diff --git a/padiff/tools/dump.py b/padiff/tools/dump.py index 92dd2ca..b579136 100644 --- a/padiff/tools/dump.py +++ b/padiff/tools/dump.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -142,8 +142,16 @@ def dump_param_with_fn(model, fn, target_models): } if model.model in target_models: # only record sublayers specified by marker param_info["available"] = True + params_found = set() for param_name, param in model.named_parameters(recursively=False): fn(param_name, param, param_info) + params_found.add(param_name) + for buffer_name, buffer in model.named_buffers(recursively=False): + if buffer_name not in params_found: + fn(buffer_name, buffer, param_info) + else: + logger.debug(f"Layer {model.class_name} ({model.route}) is NOT in target_models. Skipping.") + for name, child in model.named_children(): param_info["children"].append(dump_param_with_fn(child, fn, target_models)) return param_info @@ -174,8 +182,6 @@ def _dump(param_name, param, param_info): elif param.grad() is not None: file_name = grad_dumper(param.grad()) param_info["grads"][param_name] = file_name - else: - param_info["grads"][param_name] = None dump_param_prototype(model, _dump, f"{path}/params.json") @@ -201,7 +207,8 @@ def _dump(param_name, param, param_info): grad = param.param._collected_grad grad = get_numpy_from_tensor(grad) if grad is not None else None - param_info["grads"][param_name] = grad_dumper(grad) if grad is not None else None + if grad is not None: + param_info["grads"][param_name] = grad_dumper(grad) dump_param_prototype(model, _dump, f"{path}/grads.json") diff --git a/padiff/tools/load.py b/padiff/tools/load.py index 54ba6de..a152fc7 100644 --- a/padiff/tools/load.py +++ b/padiff/tools/load.py @@ -168,7 +168,7 @@ def load_init_weights_from_dump( param_key = param_name if param_key not in loaded_weights: - logger.info(f"param {param_key}({param_name}) not found, skip it.") + logger.warning(f"param {param_key}({param_name}) not found, skip it.") continue np_value = loaded_weights[param_key] diff --git a/tests/test_auto_diff.py b/tests/test_auto_diff.py index bae2ba2..4974120 100644 --- a/tests/test_auto_diff.py +++ b/tests/test_auto_diff.py @@ -50,7 +50,7 @@ def forward(self, x): return x -class TestOfflineCompare(unittest.TestCase): +class TestAutoDiff(unittest.TestCase): def test_check_success(self): layer = SimpleLayer() layer.eval() diff --git a/tests/test_model_with_buffer.py b/tests/test_model_with_buffer.py new file mode 100644 index 0000000..24b9b28 --- /dev/null +++ b/tests/test_model_with_buffer.py @@ -0,0 +1,112 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest +from padiff import * +import paddle +import torch + + +class LayerWithinitializedBuffer(paddle.nn.Layer): + initialized_buffer: paddle.Tensor + + def __init__(self): + super(LayerWithinitializedBuffer, self).__init__() + self.linear1 = paddle.nn.Linear(100, 100) + self.linear2 = paddle.nn.Linear(100, 10) + self.act = paddle.nn.ReLU() + self.register_buffer("initialized_buffer", paddle.zeros([1])) + + def forward(self, x): + resdual = x + x = self.linear1(x) + x = self.act(x) + x = x + resdual + self.initialized_buffer + x = self.linear2(x) + return x + + +class ModuleWithinitializedBuffer(torch.nn.Module): + initialized_buffer: torch.Tensor + + def __init__(self): + super(ModuleWithinitializedBuffer, self).__init__() + self.linear1 = torch.nn.Linear(100, 100) + self.linear2 = torch.nn.Linear(100, 10) + self.act = torch.nn.ReLU() + self.register_buffer("initialized_buffer", torch.zeros([1])) + + def forward(self, x): + resdual = x + x = self.linear1(x) + x = self.act(x) + x = x + resdual + self.initialized_buffer + x = self.linear2(x) + return x + + +class LayerWithUninitializedBuffer(paddle.nn.Layer): + uninitialized_buffer: paddle.Tensor + + def __init__(self): + super(LayerWithUninitializedBuffer, self).__init__() + self.linear = paddle.nn.Linear(10, 10) + self.register_buffer("uninitialized_buffer", None) + self._first_forward = True + + def forward(self, x): + if self._first_forward: + self.uninitialized_buffer = paddle.zeros_like(x) + self._first_forward = False + return self.linear(x) + self.uninitialized_buffer + + +class ModuleWithUninitializedBuffer(torch.nn.Module): + uninitialized_buffer: torch.Tensor + + def __init__(self): + super(ModuleWithUninitializedBuffer, self).__init__() + self.linear = torch.nn.Linear(10, 10) + self.register_buffer("uninitialized_buffer", torch.empty(0)) + self._first_forward = True + + def forward(self, x): + if self._first_forward: + self.uninitialized_buffer = torch.zeros_like(x) + self._first_forward = False + return self.linear(x) + self.uninitialized_buffer + + +class TestModelWithBuffer(unittest.TestCase): + def test_initialized_buffer(self): + layer = create_model(LayerWithinitializedBuffer()) + module = create_model(ModuleWithinitializedBuffer()) + + inp = paddle.rand((1, 100)).numpy().astype("float32") + inp = ({"x": torch.as_tensor(inp)}, {"x": paddle.to_tensor(inp)}) + assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." + + def test_uninitialized_buffer(self): + layer = create_model(LayerWithUninitializedBuffer()) + module = create_model(ModuleWithUninitializedBuffer()) + + inp = paddle.rand((1, 10)).numpy().astype("float32") + inp = ({"x": torch.as_tensor(inp)}, {"x": paddle.to_tensor(inp)}) + + assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." + + +if __name__ == "__main__": + unittest.main()