From 71ef540369d121e425cdbd0b487c39dda3adc57c Mon Sep 17 00:00:00 2001 From: lijialin03 Date: Fri, 5 Sep 2025 06:38:34 +0000 Subject: [PATCH 1/2] fix:fix bugs and optimize log --- padiff/abstracts/hooks/base.py | 2 +- padiff/abstracts/hooks/guard.py | 19 +++++---- padiff/abstracts/hooks/hook.py | 76 +++++++++++++++++++-------------- padiff/comparison/actions.py | 2 +- padiff/comparison/manual.py | 69 +++++++++++++++++------------- padiff/utils/log.py | 20 ++++++++- requirements.txt | 1 + 7 files changed, 114 insertions(+), 75 deletions(-) diff --git a/padiff/abstracts/hooks/base.py b/padiff/abstracts/hooks/base.py index a352c17..1cdad72 100644 --- a/padiff/abstracts/hooks/base.py +++ b/padiff/abstracts/hooks/base.py @@ -143,7 +143,7 @@ def find_base_report_node(net_id, step_idx): raise RuntimeError(f"Cannot find net_id={net_id} in base report.") node_list = _context.base[net_id] - if step_idx < 0 or step_idx >= len(node_list): + if step_idx >= len(node_list): raise RuntimeError(f"Index out of range: net_id={net_id}, step_idx={step_idx}, list length={len(node_list)}") return _context.base[net_id][step_idx] diff --git a/padiff/abstracts/hooks/guard.py b/padiff/abstracts/hooks/guard.py index f97c230..91e3709 100644 --- a/padiff/abstracts/hooks/guard.py +++ b/padiff/abstracts/hooks/guard.py @@ -280,15 +280,16 @@ def PaDiffGuard( yield model - except _CallsComplete: - # dump - proxy_model.dump_report(proxy_model.dump_path) - proxy_model.dump_weights(proxy_model.dump_path) - if optimizer is None: - proxy_model.dump_grads(proxy_model.dump_path) - - sys.exit(0) - except SystemExit as e: logger.info("PaDiffGuard: SystemExit received, skipping dump_report.") raise + + finally: + try: + proxy_model.dump_report(proxy_model.dump_path) + proxy_model.dump_weights(proxy_model.dump_path) + if optimizer is None: + proxy_model.dump_grads(proxy_model.dump_path) + except Exception as e: + logger.error(f"Failed to dump: {e}") + sys.exit(0) diff --git a/padiff/abstracts/hooks/hook.py b/padiff/abstracts/hooks/hook.py index 1b7af6c..a94516c 100644 --- a/padiff/abstracts/hooks/hook.py +++ b/padiff/abstracts/hooks/hook.py @@ -167,38 +167,7 @@ def info_hook(model, input, output, net_id): if single_step_state() == "forward" and net_id != -1: # two report_item with same id, the step_idx should be corresponded step_idx = len(list(filter(lambda x: x.type == "forward" and x.net_id == net_id, report.items))) - 1 - - try: - base_report_node = find_base_report_node(net_id, step_idx) - except (IndexError, RuntimeError) as e: - error_msg = str(e) - base_max_calls = "unknown" - if "list length=" in error_msg: - try: - base_max_calls = int(error_msg.split("list length=")[1].split()[0]) - except: - pass - current_calls = step_idx + 1 - route = getattr(model, "route", "unknown") - logger.error( - f"\n ❌ Single-step alignment FAILED: Execution path mismatch!" - f"\n 📌 Layer '{route}' called {current_calls} times (current) vs {base_max_calls} times (base)." - f"\n 📌 Check the forward logic in both models around this layer." - ) - sys.exit(1) - - if base_report_node["name"] != _model.__class__.__name__: - warning_msg = ( - f"\n ⚠️ Single-step alignment FAILED: Layer with net_id={net_id} mismatch!" - f"\n 📌 Mismatch Layer: {base_report_node['name']}(base) vs {_model.__class__.__name__}(raw)" - f"\n 💡 Suggestion: Models have different architectures or initialization order. " - "Please check the model implementation or decrease 'align_depth' to reduce the alignment " - "granularity, or add layers that do not require alignment to the blacklist." - ) - logger.warning(warning_msg) - else: - logger.debug(f"Single Step: {_model.__class__.__name__}(net_id={net_id})") - + base_report_node = single_step_check(report, net_id, step_idx, _model.__class__.__name__, "forward") retval = map_structure(replace_forward_output(base_report_node), output) __in_info_hook__ = False return retval @@ -297,3 +266,46 @@ def inner(input_): return input_ return inner + + +def single_step_check(report, net_id, step_idx, current_name, node_type, bwd_item=None): + + try: + base_report_node = find_base_report_node(net_id, step_idx) + if base_report_node["name"] != current_name: + warning_msg = ( + f"\n ⚠️ Single-step alignment FAILED: {node_type} with net_id={net_id} mismatch!\n" + f" 📌 Mismatch {node_type.capitalize()}: {base_report_node['name']}(base) vs {current_name}(raw)\n" + f" 💡 Suggestion: Models have different architectures or initialization order. " + "Please check the model implementation or decrease 'align_depth' to reduce the alignment " + "granularity, or add layers that do not require alignment to the blacklist." + ) + logger.warning(warning_msg) + else: + logger.debug(f"Single Step: {current_name}(net_id={net_id})") + + return base_report_node + + except (IndexError, RuntimeError) as e: + error_msg = str(e) + base_max_calls = "unknown" + if "list length=" in error_msg: + try: + base_max_calls = int(error_msg.split("list length=")[1].split()[0]) + except: + pass + current_calls = step_idx + 1 + route = "unknown" + if bwd_item and hasattr(bwd_item.net, "route"): + route = bwd_item.net.route + elif hasattr(report.stack._top().net, "route"): + route = report.stack._top().net.route + + logger.error( + f"\n ❌ Single-step alignment FAILED: Execution path mismatch in {node_type}!" + f"\n 📌 Layer '{route}' called {current_calls} times (current) vs {base_max_calls} times (base)." + f"\n 📌 Check the {node_type} logic in both models around this layer." + ) + sys.exit(1) + + return None diff --git a/padiff/comparison/actions.py b/padiff/comparison/actions.py index 87d0d5a..7afb037 100644 --- a/padiff/comparison/actions.py +++ b/padiff/comparison/actions.py @@ -119,7 +119,7 @@ def __call__(self, file_list_0, file_list_1, cfg): tensor_0 = load_numpy(info_0["path"]) tensor_1 = load_numpy(info_1["path"]) - if cfg["transpose"]: + if "transpose" in cfg and cfg["transpose"]: tensor_1 = np.transpose(tensor_1) if tensor_0.size == 0 or tensor_1.size == 0: diff --git a/padiff/comparison/manual.py b/padiff/comparison/manual.py index de54a4d..96c6de7 100644 --- a/padiff/comparison/manual.py +++ b/padiff/comparison/manual.py @@ -33,44 +33,53 @@ def compare_dumps(dump_path1, dump_path2, cfg=None, diff_phase="both"): # check grads grads_success = None if os.path.exists(f"{dump_path1}/grads.json") and os.path.exists(f"{dump_path2}/grads.json"): - logger.info("🔍 Start comparison grads (check_grads)...") - try: - grads_success = check_grads(dump_path1, dump_path2, cfg=cfg) - if grads_success: - logger.info("✅ check_grads: SUCCESS !!!\n") - else: - logger.error("❌ check_grads: FAILED !!!\n") - except Exception as e: - logger.error(f"❌ check_grads: FAILED with error: {e}\n") - grads_success = False + if len(os.listdir(f"{dump_path1}/grads")) == 0 or len(os.listdir(f"{dump_path2}/grads")) == 0: + logger.warning(f" ⚠️ Grads dir is empty of {dump_path1} or/and {dump_path2}\n") + else: + logger.info("🔍 Start comparison grads (check_grads)...") + try: + grads_success = check_grads(dump_path1, dump_path2, cfg=cfg) + if grads_success: + logger.info("✅ check_grads: SUCCESS !!!\n") + else: + logger.error("❌ check_grads: FAILED !!!\n") + except Exception as e: + logger.error(f"❌ check_grads: FAILED with error: {e}\n") + grads_success = False # check weights weights_success = None if os.path.exists(f"{dump_path1}/weights.json") and os.path.exists(f"{dump_path2}/weights.json"): - logger.info("🔍 Start comparison weights (check_weights)...") - try: - weights_success = check_weights(dump_path1, dump_path2, cfg=cfg) - if weights_success: - logger.info("✅ check_weights: SUCCESS !!!\n") - else: - logger.error("❌ check_weights: FAILED !!!\n") - except Exception as e: - logger.error(f"❌ check_weights: FAILED with error: {e}\n") - weights_success = False + if len(os.listdir(f"{dump_path1}/weights")) == 0 or len(os.listdir(f"{dump_path2}/weights")) == 0: + logger.warning(f" ⚠️ Weights dir is empty of {dump_path1} or/and {dump_path2}\n") + else: + logger.info("🔍 Start comparison weights (check_weights)...") + try: + weights_success = check_weights(dump_path1, dump_path2, cfg=cfg) + if weights_success: + logger.info("✅ check_weights: SUCCESS !!!\n") + else: + logger.error("❌ check_weights: FAILED !!!\n") + except Exception as e: + logger.error(f"❌ check_weights: FAILED with error: {e}\n") + weights_success = False # check params params_success = None if os.path.exists(f"{dump_path1}/params.json") and os.path.exists(f"{dump_path2}/params.json"): - logger.info("🔍 Start comparison all parameters (check_params)...") - try: - params_success = check_params(dump_path1, dump_path2, cfg=cfg) - if params_success: - logger.info("✅ check_params: SUCCESS !!!\n") - else: - logger.error("❌ check_params: FAILED !!!\n") - except Exception as e: - logger.error(f"❌ check_params: FAILED with error: {e}\n") - params_success = False + if len(os.listdir(f"{dump_path1}/params")) == 0 or len(os.listdir(f"{dump_path2}/params")) == 0: + logger.warning(f" ⚠️ Params dir is empty of {dump_path1} or/and {dump_path2}\n") + else: + logger.info("🔍 Start comparison all parameters (check_params)...") + try: + params_success = check_params(dump_path1, dump_path2, cfg=cfg) + if params_success: + logger.info("✅ check_params: SUCCESS !!!\n") + else: + logger.error("❌ check_params: FAILED !!!\n") + except Exception as e: + logger.error(f"❌ check_params: FAILED with error: {e}\n") + params_success = False # final result success = report_success diff --git a/padiff/utils/log.py b/padiff/utils/log.py index 2ca457c..29b7807 100644 --- a/padiff/utils/log.py +++ b/padiff/utils/log.py @@ -15,6 +15,15 @@ import os import shutil import logging +import colorlog + + +log_config = { + "DEBUG": {"level": 10, "color": "cyan"}, + "INFO": {"level": 20, "color": "green"}, + "WARNING": {"level": 30, "color": "yellow"}, + "ERROR": {"level": 40, "color": "red"}, +} class Logger: @@ -23,6 +32,14 @@ def __init__(self): self._is_initialized = False self.log_path = "padiff_log" + for key, conf in log_config.items(): + logging.addLevelName(conf["level"], key) + + self.colored_formatter = colorlog.ColoredFormatter( + "%(log_color)s[AutoDiff] [%(levelname)s]%(reset)s %(message)s", + log_colors={key: conf["color"] for key, conf in log_config.items()}, + ) + def setup(self, log_parent_dir): if self._is_initialized: return @@ -50,8 +67,7 @@ def setup(self, log_parent_dir): file_handler.setFormatter(file_formatter) console_handler = logging.StreamHandler() - console_formatter = logging.Formatter("[AutoDiff] [%(levelname)s] %(message)s") - console_handler.setFormatter(console_formatter) + console_handler.setFormatter(self.colored_formatter) self._logger.addHandler(file_handler) self._logger.addHandler(console_handler) diff --git a/requirements.txt b/requirements.txt index 3716c53..c14b1a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ graphviz numpy coverage pyyaml +colorlog From 63e23a6016e17e055440420173c64180ed650f5e Mon Sep 17 00:00:00 2001 From: lijialin03 Date: Mon, 8 Sep 2025 09:01:35 +0000 Subject: [PATCH 2/2] fix: fix tests bugs --- Makefile | 10 +- README.md | 12 +- padiff/abstracts/hooks/guard.py | 4 +- padiff/tools/load.py | 50 +++-- padiff/utils/log.py | 15 +- tests/__init__.py | 13 ++ tests/padiff_unittests.py | 78 +++++-- tests/test_api_to_Layer.py | 4 - ...t_offline_compare.py => test_auto_diff.py} | 0 tests/test_check_weight_grad.py | 11 +- tests/test_cli_end_to_end.py | 204 ++++++++++++++++++ tests/test_many_usages.py | 64 ++++-- 12 files changed, 387 insertions(+), 78 deletions(-) create mode 100644 tests/__init__.py rename tests/{test_offline_compare.py => test_auto_diff.py} (100%) create mode 100644 tests/test_cli_end_to_end.py diff --git a/Makefile b/Makefile index feca153..f8c8e89 100644 --- a/Makefile +++ b/Makefile @@ -34,11 +34,17 @@ lint-all: # # # # # # # # # # # # # # # Test Block # # # # # # # # # # # # # # # .PHONY: test -test: unit-test +test: unit-test unit-test-special coverage-report unit-test: @echo "Running unit tests with coverage..." - PYTHONPATH=. coverage run --source=. tests/padiff_unittests.py + PADIFF_SILENT=1 PYTHONPATH="$(shell pwd):$(PYTHONPATH)" coverage run --source=. tests/padiff_unittests.py + +unit-test-special: + @echo "Running test_api_to_Layer.py with PADIFF_API_CHECK=ON" + PADIFF_SILENT=1 PADIFF_API_CHECK=ON PYTHONPATH="$(shell pwd):$(PYTHONPATH)" coverage run --source=. --append tests/test_api_to_Layer.py + +coverage-report: @echo "" @echo "Coverage Report:" coverage report -m diff --git a/README.md b/README.md index 700a1a8..435a0b9 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ **P**addle **A**utomatically **Diff** precision toolkits. -## 最近更新(latest 9.2) +## 最近更新(latest 9.8) ### 使用单行命令对齐(支持前反向对齐) @@ -54,9 +54,15 @@ compare_mode: "mean" action_name: "equal" ``` -### 开启 debug 模式(获取更多 log 信息) +### log 设置 -设置环境变量 `export PADIFF_DEBUG=1`,或使用命令运行 `PADIFF_DEBUG=1 python -m padiff.cli ...` +#### 开启 debug 模式 + +为了获取更多 log 信息,可以设置环境变量 `export PADIFF_LOG_LEVEL=DEBUG`,或使用命令运行 `PADIFF_LOG_LEVEL=DEBUG python -m padiff.cli ...` + +#### 开启静默模式 + +或者为了保持控制台信息简洁,可以设置环境变量 `PADIFF_SILENT=1`,以便仅保存 log 文件,不在控制台输出 log 信息 ## 简介 diff --git a/padiff/abstracts/hooks/guard.py b/padiff/abstracts/hooks/guard.py index 91e3709..2711494 100644 --- a/padiff/abstracts/hooks/guard.py +++ b/padiff/abstracts/hooks/guard.py @@ -235,12 +235,12 @@ def PaDiffGuard( # set max calls calls_context.set_limit(max_calls) - logger.info(f"PaDiffGuard: creating proxy model.") proxy_model = create_model(model, name=name, reset_dir=reset_flag) model._padiff_proxy = proxy_model + logger.debug(f"PaDiffGuard: creating proxy model.") if optimizer is not None and not hasattr(optimizer, "_padiff_proxy_model"): - logger.info(f"PaDiffGuard: wrapping optimizer.step().") + logger.debug(f"PaDiffGuard: wrapping optimizer.step().") optimizer._padiff_proxy_model = proxy_model wrap_optimizer_step(optimizer) diff --git a/padiff/tools/load.py b/padiff/tools/load.py index c60b3c5..54ba6de 100644 --- a/padiff/tools/load.py +++ b/padiff/tools/load.py @@ -47,6 +47,8 @@ def load_first_input_from_dump(report_path, tar_framework): args = [] kwargs = {} + NATIVE_TYPES = (int, float, str, bool, type(None)) + for item in meta_info: file_path = os.path.join(input_dir, item["path"]) key = item.get("key") @@ -62,31 +64,45 @@ def load_first_input_from_dump(report_path, tar_framework): tensor.requires_grad_(True) else: raise ValueError(f"Unsupported framework: {tar_framework}") + value = tensor - if key is None: - args.append(tensor) - else: - kwargs[key] = tensor else: + with open(file_path, "r") as f: + full_item = json.load(f) + if item["type"] == "dict": - reconstructed_dict = {} - for k, v in item["data"].items(): - reconstructed_dict[k] = v - value = reconstructed_dict - elif item["type"] in ["list", "tuple"]: - reconstructed_list = [v for v in item["data"]] - value = tuple(reconstructed_list) if item["type"] == "tuple" else reconstructed_list + value = {k: v for k, v in full_item["data"].items()} + elif item["type"] == "list": + value = [v for v in full_item["data"]] + elif item["type"] == "tuple": + value = tuple(v for v in full_item["data"]) + elif item["type"] == "int": + value = int(full_item["data"]) + elif item["type"] == "float": + value = float(full_item["data"]) + elif item["type"] == "bool": + value = full_item["data"].lower() == "true" + elif item["type"] == "NoneType": + value = None + elif item["type"] == "str": + value = full_item["data"] else: - value = item["data"] + logger.warning(f"Skipping unsupported input type '{item['type']}' for input(key={key}).") + continue + + if key is None: + args.append(value) + else: + kwargs[key] = value - if key is None: - args.append(value) - else: - kwargs[key] = value except Exception as e: - logger.error(f"Error loading metadata file {file_path}: {e}") + logger.error(f"Error loading input(key={key}) in {file_path}: {e}") raise + if not args and not kwargs: + logger.warning("No valid inputs were loaded from the dump.") + return None + return (args, kwargs) diff --git a/padiff/utils/log.py b/padiff/utils/log.py index 29b7807..d47226c 100644 --- a/padiff/utils/log.py +++ b/padiff/utils/log.py @@ -46,13 +46,11 @@ def setup(self, log_parent_dir): self._logger = logging.getLogger("padiff") - debug_flag = os.getenv("PADIFF_DEBUG") + silent_flag = os.getenv("PADIFF_SILENT") log_level_flag = os.getenv("PADIFF_LOG_LEVEL") if log_level_flag and log_level_flag.upper() in ("DEBUG", "INFO", "WARNING", "ERROR"): log_level = getattr(logging, log_level_flag.upper()) - elif debug_flag and debug_flag.strip().lower() in ("1", "true", "on"): - log_level = logging.DEBUG else: log_level = logging.INFO self._logger.setLevel(log_level) @@ -65,14 +63,15 @@ def setup(self, log_parent_dir): file_handler = logging.FileHandler(log_file_path, encoding="utf-8") file_formatter = logging.Formatter("[AutoDiff] [%(levelname)s] %(message)s") file_handler.setFormatter(file_formatter) + self._logger.addHandler(file_handler) - console_handler = logging.StreamHandler() - console_handler.setFormatter(self.colored_formatter) + if not silent_flag or silent_flag.strip().lower() not in ("1", "true", "on"): + console_handler = logging.StreamHandler() + console_handler.setFormatter(self.colored_formatter) + self._logger.addHandler(console_handler) - self._logger.addHandler(file_handler) - self._logger.addHandler(console_handler) + self._logger.info(f"Logging initialized. Log file: {log_file_path}") - self._logger.info(f"Logging initialized. Log file: {log_file_path}") self._is_initialized = True self.log_path = log_parent_dir diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..a9cc79c --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/padiff_unittests.py b/tests/padiff_unittests.py index 9f88d8c..955861e 100644 --- a/tests/padiff_unittests.py +++ b/tests/padiff_unittests.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,21 +12,67 @@ # See the License for the specific language governing permissions and # limitations under the License. + +import unittest import os import sys -import subprocess - - -for root, dirs, files in os.walk("./"): - for fname in files: - if fname.endswith(".py") and fname.startswith("test_"): - fpath = root + "/" + fname - (status, output) = subprocess.getstatusoutput("python " + fpath) - if status != 0: - err_info = f"*** ===================== {fpath} ========================= ***\n" - err_info += f"{output}\n" - print(f"Failed on unittest {fname} with error message \n {err_info}.", end="\n", flush=True) - sys.exit(1) +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +EXCLUDE_FILES = {"test_api_to_Layer"} + + +def discover_and_run_filtered_tests(): + loader = unittest.TestLoader() + all_tests = loader.discover(start_dir=".", pattern="test_*.py") + + filtered_suite = unittest.TestSuite() + + def add_filtered_tests(test): + if isinstance(test, unittest.TestCase): + module = sys.modules.get(test.__class__.__module__) + if module and hasattr(module, "__file__"): + filename = os.path.basename(module.__file__) + filename_without_ext = os.path.splitext(filename)[0] + if filename_without_ext not in EXCLUDE_FILES: + filtered_suite.addTest(test) + else: + print(f"Excluding test from file: {filename}") else: - print(f"Succeed on unittest {fname}.", end="\n", flush=True) - os.system("rm -rf ./tests/padiff_dump ./tests/padiff_log") + filtered_suite.addTest(test) + elif isinstance(test, unittest.TestSuite): + for subtest in test: + add_filtered_tests(subtest) + + for suite in all_tests: + add_filtered_tests(suite) + + if filtered_suite.countTestCases() == 0: + print("No tests to run after filtering.") + return False + + runner = unittest.TextTestRunner(verbosity=0) + result = runner.run(filtered_suite) + + os.system("rm -rf ./tests/padiff_dump ./tests/padiff_log") + + return result.wasSuccessful() + + +def main(): + try: + success = discover_and_run_filtered_tests() + if not success: + sys.exit(1) + except Exception as e: + print(f"An error occurred during test execution: {type(e).__name__}: {str(e)}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tests/test_api_to_Layer.py b/tests/test_api_to_Layer.py index 2e35735..665769a 100644 --- a/tests/test_api_to_Layer.py +++ b/tests/test_api_to_Layer.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -os.environ["PADIFF_API_CHECK"] = "ON" import unittest @@ -66,4 +63,3 @@ def test_api_to_Layer(self): if __name__ == "__main__": unittest.main() - os.environ["PADIFF_API_CHECK"] = "OFF" diff --git a/tests/test_offline_compare.py b/tests/test_auto_diff.py similarity index 100% rename from tests/test_offline_compare.py rename to tests/test_auto_diff.py diff --git a/tests/test_check_weight_grad.py b/tests/test_check_weight_grad.py index 5226d95..205784f 100644 --- a/tests/test_check_weight_grad.py +++ b/tests/test_check_weight_grad.py @@ -20,9 +20,9 @@ from padiff import * -from padiff.checker import check_grads, check_weights -from padiff.dump_tools import dump_grads, dump_weights -from padiff.interfaces.diff_utils import default_loss +from padiff import check_grads, check_weights +from padiff import dump_grads, dump_weights +from padiff.comparison.auto import default_loss class SimpleLayer(paddle.nn.Layer): @@ -63,18 +63,17 @@ def test_weight_grad_check_0(self): module = create_model(SimpleModule()) inp = paddle.rand((100, 100)).numpy().astype("float32") + inp_modified = inp * 2 assign_weight(layer, module) out = layer(paddle.to_tensor(inp)) loss = default_loss(out, "paddle") layer.backward(loss) - out = module(torch.as_tensor(inp)) + out = module(torch.as_tensor(inp_modified)) loss = default_loss(out, "torch") module.backward(loss) - module.model.zero_grad() - dump_weights(layer, layer.dump_path) dump_weights(module, module.dump_path) diff --git a/tests/test_cli_end_to_end.py b/tests/test_cli_end_to_end.py new file mode 100644 index 0000000..5536191 --- /dev/null +++ b/tests/test_cli_end_to_end.py @@ -0,0 +1,204 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest +import os +import shutil +import tempfile +import sys +from unittest.mock import patch +from io import StringIO +import numpy as np +from padiff.cli import main as padiff_cli_main + + +PADDLE_SCRIPT_TEMPLATE = """ +import paddle +import numpy as np + +class SimpleLayer(paddle.nn.Layer): + def __init__(self): + super(SimpleLayer, self).__init__() + self.linear1 = paddle.nn.Linear({input_dim}, {hidden_dim}) + self.linear2 = paddle.nn.Linear({hidden_dim}, {output_dim}) + self.act = paddle.nn.ReLU() + + def forward(self, x): + residual = x + x = self.linear1(x) + x = self.act(x) + x = x + residual + x = self.linear2(x) + return x + +def main(): + inp = np.load("{input_file}") + inp = paddle.to_tensor(inp) + + model = SimpleLayer() + optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=0.1) + + out = model(inp) + loss = paddle.mean(out) + loss.backward() + optimizer.step() + optimizer.clear_grad() + +if __name__ == "__main__": + import os + os.makedirs("{dump_path}", exist_ok=True) + main() +""" + +TORCH_SCRIPT_TEMPLATE = """ +import torch +import numpy as np + +class SimpleModule(torch.nn.Module): + def __init__(self): + super(SimpleModule, self).__init__() + self.linear1 = torch.nn.Linear({input_dim}, {hidden_dim}) + self.linear2 = torch.nn.Linear({hidden_dim}, {output_dim}) + self.act = torch.nn.ReLU() + + def forward(self, x): + residual = x + x = self.linear1(x) + x = self.act(x) + x = x + residual + x = self.linear2(x) + return x + +def main(): + inp = np.load("{input_file}") + inp = torch.as_tensor(inp) + + model = SimpleModule() + optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + + out = model(inp) + loss = torch.mean(out) + loss.backward() + optimizer.step() + optimizer.zero_grad() + +if __name__ == "__main__": + import os + os.makedirs("{dump_path}", exist_ok=True) + main() +""" + + +class TestCliEndToEnd(unittest.TestCase): + def setUp(self): + self.test_dir = tempfile.mkdtemp() + self.addCleanup(shutil.rmtree, self.test_dir) + + self.paddle_script_path = os.path.join(self.test_dir, "paddle_script.py") + self.torch_script_path = os.path.join(self.test_dir, "torch_script.py") + self.input_file = os.path.join(self.test_dir, "input.npy") + self.log_dir = os.path.join(self.test_dir, "padiff_logs") + + os.makedirs(self.log_dir, exist_ok=True) + + inp = np.random.rand(100, 100).astype("float32") + np.save(self.input_file, inp) + assert os.path.exists(self.input_file), f"Input file not created: {self.input_file}" + + torch_script = TORCH_SCRIPT_TEMPLATE.format( + input_dim=100, + hidden_dim=100, + output_dim=10, + input_file=self.input_file, + model_name="model_torch", + dump_path=os.path.join(self.log_dir, "torch"), + ) + + paddle_script = PADDLE_SCRIPT_TEMPLATE.format( + input_dim=100, + hidden_dim=100, + output_dim=10, + input_file=self.input_file, + model_name="model_paddle", + dump_path=os.path.join(self.log_dir, "paddle"), + ) + + with open(self.torch_script_path, "w") as f: + f.write(torch_script) + with open(self.paddle_script_path, "w") as f: + f.write(paddle_script) + + assert os.path.exists(self.torch_script_path), f"torch_script.py not created: {self.torch_script_path}" + assert os.path.exists(self.paddle_script_path), f"paddle_script.py not created: {self.paddle_script_path}" + assert os.path.getsize(self.torch_script_path) > 0, f"torch_script.py is empty: {self.torch_script_path}" + assert os.path.getsize(self.paddle_script_path) > 0, f"paddle_script.py is empty: {self.paddle_script_path}" + + def _run_cli_test(self, extra_args): + test_args = [ + "padiff", + "--pt_cmd", + f"python {self.torch_script_path}", + "--pd_cmd", + f"python {self.paddle_script_path}", + "--pt_model_name", + "model", + "--pd_model_name", + "model", + "--log_dir", + self.log_dir, + ] + test_args.extend(extra_args) + + with patch.object(sys, "argv", test_args): + with patch("sys.stdout", new=StringIO()) as fake_out: + with patch("sys.stderr", new=StringIO()) as fake_err: + try: + padiff_cli_main() + except SystemExit as e: + if e.code != 0: + self.fail(f"CLI failed with exit code {e.code}: {fake_err.getvalue()}") + except Exception as e: + self.fail(f"CLI raised an unexpected exception: {type(e).__name__}: {str(e)}") + + def test_end_to_end_basic(self): + self._run_cli_test([]) + + def test_end_to_end_with_optimizer(self): + self._run_cli_test(["--pt_optim_name", "optimizer", "--pd_optim_name", "optimizer"]) + + def test_end_to_end_with_align_depth(self): + self._run_cli_test(["--align_depth", "0"]) + + def test_end_to_end_with_single_step(self): + self._run_cli_test(["--single_step_mode", "forward"]) + self._run_cli_test(["--single_step_mode", "backward"]) + self._run_cli_test(["--single_step_mode", "both"]) + + def test_end_to_end_with_black_list(self): + self._run_cli_test(["--black_list", "Linear"]) + + def test_end_to_end_with_different_atol_rtol(self): + self._run_cli_test(["--atol", "1e-3", "--rtol", "1e-4"]) + + def test_end_to_end_with_different_compare_mode(self): + self._run_cli_test(["--compare_mode", "strict"]) + self._run_cli_test(["--compare_mode", "abs_mean"]) + + def test_end_to_end_with_different_action(self): + self._run_cli_test(["--action_name", "loose_equal"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_many_usages.py b/tests/test_many_usages.py index 7eac23a..be799c2 100644 --- a/tests/test_many_usages.py +++ b/tests/test_many_usages.py @@ -16,11 +16,14 @@ import unittest import paddle import os +import json from paddle.distributed.fleet.utils import recompute default_path = get_dump_root_path() train_step = 10 +dump_freq = 2 +rank = paddle.distributed.get_rank() if paddle.distributed.is_initialized() else 0 class SimpleLayer(paddle.nn.Layer): @@ -54,13 +57,34 @@ def __len__(self): return self.num_samples +def check_file_integrity(dump_path, step, rank): + step_dump_path = os.path.join(dump_path, f"step_{step}", f"rank_{rank}") + + report_json_path = os.path.join(step_dump_path, "report.json") + params_json_path = os.path.join(step_dump_path, "params.json") + + assert os.path.exists(report_json_path), f"report.json not found: {report_json_path}" + assert os.path.exists(params_json_path), f"params.json not found: {params_json_path}" + + try: + with open(report_json_path, "r") as f: + report_data = json.load(f) + with open(params_json_path, "r") as f: + params_data = json.load(f) + except Exception as e: + raise RuntimeError(f"Failed to load JSON files: {e}") + + assert "tree" in report_data, "Invalid report.json format" + assert "tree" in params_data, "Invalid params.json format" + + class Test0SingleModelRun(unittest.TestCase): # single model run def test_single_model_run(self): print("Test for single model run.") layer = SimpleLayer() set_dump_root_path(os.path.join(default_path, "single_model_run")) - layer = create_model(layer, dump_freq=2) + layer = create_model(layer, dump_freq=dump_freq) inp = paddle.rand((100, 100)).numpy().astype("float32") opt = paddle.optimizer.SGD(learning_rate=1e-3, parameters=layer.model.parameters()) @@ -72,8 +96,8 @@ def test_single_model_run(self): opt.clear_grad() layer.try_dump() - assert check_report(layer.dump_path, layer.dump_path) - assert check_params(layer.dump_path, layer.dump_path) + if i % dump_freq == 0: + check_file_integrity(layer.dump_path, i, rank) class Test1DataloaderRun(unittest.TestCase): @@ -83,12 +107,12 @@ def test_dataloader_run(self): print("Test for real dataloader.") layer = SimpleLayer() set_dump_root_path(os.path.join(default_path, "real_dataLoader")) - layer = create_model(layer, dump_freq=2) + layer = create_model(layer, dump_freq=dump_freq) opt = paddle.optimizer.SGD(learning_rate=1e-3, parameters=layer.model.parameters()) dataset = RandomDataset(train_step) loader = paddle.io.DataLoader(dataset) - for inp in loader(): + for step, inp in enumerate(loader()): out = layer(paddle.to_tensor(inp)) loss = out.mean() layer.backward(loss) @@ -96,8 +120,8 @@ def test_dataloader_run(self): opt.clear_grad() layer.try_dump() - assert check_report(layer.dump_path, layer.dump_path) - assert check_params(layer.dump_path, layer.dump_path) + if step % dump_freq == 0: + check_file_integrity(layer.dump_path, step, rank) class Test2WhiteLayerRun(unittest.TestCase): @@ -108,13 +132,13 @@ def test_white_layer_class_run(self): print("Test for single model run.") layer = SimpleLayer() set_dump_root_path(os.path.join(default_path, "white_layer_class")) - layer = create_model(layer, dump_freq=2) + layer = create_model(layer, dump_freq=dump_freq) layer.update_white_list_with_class(paddle.nn.Linear, mode="all") opt = paddle.optimizer.SGD(learning_rate=1e-3, parameters=layer.model.parameters()) dataset = RandomDataset(train_step) loader = paddle.io.DataLoader(dataset) - for inp in loader(): + for step, inp in enumerate(loader()): out = layer(paddle.to_tensor(inp)) loss = out.mean() layer.backward(loss) @@ -122,8 +146,8 @@ def test_white_layer_class_run(self): opt.clear_grad() layer.try_dump() - assert check_report(layer.dump_path, layer.dump_path) - assert check_params(layer.dump_path, layer.dump_path) + if step % dump_freq == 0: + check_file_integrity(layer.dump_path, step, rank) class Test3GradAccumulationRun(unittest.TestCase): @@ -135,7 +159,7 @@ def test_grad_accumulation_run(self): print("Test for gradient accumulation.") layer = SimpleLayer() set_dump_root_path(os.path.join(default_path, "grad_accumulation")) - layer = create_model(layer, dump_freq=2) + layer = create_model(layer, dump_freq=dump_freq) layer.update_white_list_with_class(paddle.nn.Linear, mode="all") opt = paddle.optimizer.SGD(learning_rate=1e-3, parameters=layer.model.parameters()) @@ -150,8 +174,8 @@ def test_grad_accumulation_run(self): opt.clear_grad() layer.try_dump() - assert check_report(layer.dump_path, layer.dump_path) - assert check_params(layer.dump_path, layer.dump_path) + if step % dump_freq == 0: + check_file_integrity(layer.dump_path, step, rank) class Test4RecomputeRun(unittest.TestCase): @@ -164,7 +188,7 @@ def test_recompute_run(self): print("Test for recompute.") layer = SimpleLayer() set_dump_root_path(os.path.join(default_path, "recompute")) - layer = create_model(layer, dump_freq=2) + layer = create_model(layer, dump_freq=dump_freq) layer.update_white_list_with_class(paddle.nn.Linear, mode="all") opt = paddle.optimizer.SGD(learning_rate=1e-3, parameters=layer.model.parameters()) @@ -181,8 +205,8 @@ def test_recompute_run(self): opt.clear_grad() layer.try_dump() - assert check_report(layer.dump_path, layer.dump_path) - assert check_params(layer.dump_path, layer.dump_path) + if step % dump_freq == 0: + check_file_integrity(layer.dump_path, step, rank) class Test5AMPRun(unittest.TestCase): @@ -197,7 +221,7 @@ def test_amp_run(self): layer = SimpleLayer() layer = paddle.amp.decorate(layer, level="O2") set_dump_root_path(os.path.join(default_path, "amp")) - layer = create_model(layer, dump_freq=2) + layer = create_model(layer, dump_freq=dump_freq) layer.update_white_list_with_class(paddle.nn.Linear, mode="all") opt = paddle.optimizer.SGD(learning_rate=1e-3, parameters=layer.model.parameters()) @@ -215,8 +239,8 @@ def test_amp_run(self): opt.clear_grad() layer.try_dump() - assert check_report(layer.dump_path, layer.dump_path) - assert check_params(layer.dump_path, layer.dump_path) + if step % dump_freq == 0: + check_file_integrity(layer.dump_path, step, rank) if __name__ == "__main__":