diff --git a/padiff/comparison/actions.py b/padiff/comparison/actions.py index c14ef8e..87d0d5a 100644 --- a/padiff/comparison/actions.py +++ b/padiff/comparison/actions.py @@ -40,7 +40,7 @@ def get_action_by_name(self, name): raise ValueError(f"Action '{name}' not registered. Available: {list(self.pool.keys())}") return self.pool[name] - def find_actions(self, report_0, node_0, report_1, node_1, name=None): + def find_actions(self, report_0=None, node_0=None, report_1=None, node_1=None, name=None): if name is not None: return self.get_action_by_name(name) @@ -85,11 +85,16 @@ def __call__(self, file_list_0, file_list_1, cfg): for info_0, info_1 in zip(file_list_0, file_list_1): tensor_0 = load_numpy(info_0["path"]) tensor_1 = load_numpy(info_1["path"]) + + if "transpose" in cfg and cfg["transpose"]: + tensor_1 = np.transpose(tensor_1) + if tensor_0.size == 0 or tensor_1.size == 0: if tensor_0.size != tensor_1.size: raise RuntimeError("size of tensors is not equal") logger.warning("Found nparray.size == 0, compare skipped!") continue + assert_tensor_equal(tensor_0, tensor_1, cfg) @@ -113,9 +118,14 @@ def __call__(self, file_list_0, file_list_1, cfg): for info_0, info_1 in zip(file_list_0[:min_len], file_list_1[:min_len]): tensor_0 = load_numpy(info_0["path"]) tensor_1 = load_numpy(info_1["path"]) + + if cfg["transpose"]: + tensor_1 = np.transpose(tensor_1) + if tensor_0.size == 0 or tensor_1.size == 0: logger.debug("Found empty tensor, compare skipped!") continue + if tensor_0.shape != tensor_1.shape: logger.debug(f"Shape of tensors are not equal: {tensor_0.shape}!={tensor_1.shape}") if tensor_0.size == tensor_1.size: @@ -123,7 +133,22 @@ def __call__(self, file_list_0, file_list_1, cfg): tensor_1 = np.reshape(tensor_1, tensor_0.shape) else: continue + assert_tensor_equal(tensor_0, tensor_1, cfg) num_success += 1 + if min_len != 0 and num_success == 0: raise RuntimeError("All outputs for the layer have different shape!") + + +@global_actions.register("ignore") +class IgnoreAction(Action): + def match(self, report_0, node_0, report_1, node_1): + return True + + @property + def priority(self): + return 100 + + def __call__(self, file_list_0, file_list_1, cfg): + pass diff --git a/padiff/comparison/checker/base.py b/padiff/comparison/checker/base.py index e97d7d2..0086c55 100644 --- a/padiff/comparison/checker/base.py +++ b/padiff/comparison/checker/base.py @@ -16,37 +16,7 @@ import torch from itertools import zip_longest import numpy as np -from ...configs import global_yaml_loader -from ...utils import load_numpy, struct_info_log, assert_tensor_equal, logger - - -def process_each_param(process, node_lists, reports, compare_target, cfg): - for node_0, node_1 in zip_longest(node_lists[0], node_lists[1], fillvalue=None): - if node_0 is None or node_1 is None: - raise RuntimeError("Found model with difference number of sublayers. Check your model.") - for (param_name_0, param_path_0), (param_name_1, param_path_1) in zip( - node_0[compare_target].items(), - node_1[compare_target].items(), - ): - try: - settings = global_yaml_loader.get_weight_settings( - (node_0["name"], node_1["name"]), - (reports[0]["framework"], reports[1]["framework"]), - (param_name_0, param_name_1), - ) - settings.update(cfg) - param_0 = load_numpy(param_path_0) - param_1 = load_numpy(param_path_1) - process([node_0, node_1], [param_name_0, param_name_1], [param_0, param_1], settings) - except Exception as e: - err_str = f"{type(e).__name__ + ': ' + str(e)}\n" - err_str += f"Error occured between:\n" - err_str += f" (base_model): {node_0['route'] + '.' + param_name_0}\n" - err_str += f" (raw_model): {node_1['route'] + '.' + param_name_1}\n\n" - - err_str += struct_info_log(reports, (compare_target, compare_target), compare_target) - - raise RuntimeError(err_str) +from ...utils import assert_tensor_equal, logger def assert_shape(params, settings): diff --git a/padiff/comparison/checker/params.py b/padiff/comparison/checker/params.py index 718a46b..0dad6a5 100644 --- a/padiff/comparison/checker/params.py +++ b/padiff/comparison/checker/params.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os.path as osp +from itertools import zip_longest from ...utils import logger, build_file_name, get_all_valid_path, load_json, traversal_node -from ...configs import parse_cfg -from .base import assert_weight, assert_grad, process_each_param +from ...configs import parse_cfg, global_yaml_loader +from ..actions import get_action def check_params(report_path_0, report_path_1, cfg=None): @@ -30,79 +32,83 @@ def check_params(report_path_0, report_path_1, cfg=None): node_lists = [traversal_node(rep["tree"], []) for rep in reports] logger.info(f"Checking params in {path_0} and {path_1}") - weight_rst = weight_rst and check_target(assert_weight, node_lists, reports, "weights", cfg) - grad_rst = grad_rst and check_target(assert_grad, node_lists, reports, "grads", cfg) + weight_rst = weight_rst and _check_params_impl(node_lists, reports, "weights", cfg) + grad_rst = grad_rst and _check_params_impl(node_lists, reports, "grads", cfg) return weight_rst and grad_rst def check_weights(report_path_0, report_path_1, cfg=None): - cfg = parse_cfg(cfg) - logger.info(f"Check weights cfg: {cfg}") + return _check_param_impl(report_path_0, report_path_1, "weights", cfg) - weight_rst = True - all_ranks_path_0, all_ranks_path_1 = get_all_valid_path(report_path_0, report_path_1) - for path_0, path_1 in zip(all_ranks_path_0, all_ranks_path_1): - reports = [load_json(path_0, "weights.json"), load_json(path_1, "weights.json")] - node_lists = [traversal_node(rep["tree"], []) for rep in reports] - logger.info(f"Checking weights in {path_0} and {path_1}") - weight_rst = weight_rst and check_target(assert_weight, node_lists, reports, "weights", cfg) - return weight_rst +def check_grads(report_path_0, report_path_1, cfg=None): + return _check_param_impl(report_path_0, report_path_1, "grads", cfg) -def check_grads(report_path_0, report_path_1, cfg=None): +def _check_param_impl(report_path_0, report_path_1, compare_target, cfg=None): cfg = parse_cfg(cfg) - logger.info(f"Check grads cfg: {cfg}") + logger.info(f"Check {compare_target} cfg: {cfg}") - grad_rst = True + param_rst = True all_ranks_path_0, all_ranks_path_1 = get_all_valid_path(report_path_0, report_path_1) for path_0, path_1 in zip(all_ranks_path_0, all_ranks_path_1): - reports = [load_json(path_0, "grads.json"), load_json(path_1, "grads.json")] + reports = [load_json(path_0, f"{compare_target}.json"), load_json(path_1, f"{compare_target}.json")] node_lists = [traversal_node(rep["tree"], []) for rep in reports] - logger.info(f"Checking grads in {path_0} and {path_1}") - grad_rst = grad_rst and check_target(assert_grad, node_lists, reports, "grads", cfg) - return grad_rst + logger.info(f"Checking {compare_target} in {path_0} and {path_1}") + param_rst = param_rst and _check_params_impl(node_lists, reports, compare_target, cfg) + return param_rst -def check_target(fn, node_lists, reports, compare_target, cfg): - flag = True +def _check_params_impl(node_lists, reports, compare_target, cfg): + diff_found = False log_name = build_file_name(reports[0], compare_target + "_diff") + if osp.exists(osp.join(logger.log_path, log_name)): + with open(osp.join(logger.log_path, log_name), "w") as f: + pass + + action_name = cfg.get("action_name", None) + act = get_action(name=action_name) + + for node_0, node_1 in zip_longest(node_lists[0], node_lists[1], fillvalue=None): + if node_0 is None or node_1 is None: + raise RuntimeError("Found model with difference number of sublayers. Check your model.") + + for (param_name_0, param_path_0), (param_name_1, param_path_1) in zip( + node_0[compare_target].items(), + node_1[compare_target].items(), + ): + try: + assert ( + param_path_0 is not None and param_path_1 is not None + ), f"{compare_target.capitalize()} for at least one of base or raw model is not found." + + settings = global_yaml_loader.get_weight_settings( + (node_0["name"], node_1["name"]), + (reports[0]["framework"], reports[1]["framework"]), + (param_name_0, param_name_1), + ) + settings.update(cfg) - def checker(nodes, param_names, params, settings): - try: - fn(params, settings) - except Exception as e: - nonlocal flag - flag = False - info = ( - "=" * 25 + "\n" + "{} value is different.\n" - "between base_model: {}\n" - " raw_model: {}\n\n" - "base_model param path:\n {}\n" - "raw_model param path:\n {}\n\n" - "{}\n\n".format( - compare_target, - nodes[0]["repr"], - nodes[1]["repr"], - nodes[0]["route"] + "." + param_names[0], - nodes[1]["route"] + "." + param_names[1], - type(e).__name__ + ": " + str(e), + file_list_0 = [{"path": param_path_0}] + file_list_1 = [{"path": param_path_1}] + + act(file_list_0, file_list_1, settings) + + except Exception as e: + diff_found = True + info = ( + f"=========================\n" + f"FAILED!!! {compare_target.capitalize()} Mismatch:\n" + f" Layer: {node_0['name']}(base) vs {node_1['name']}(raw)\n" + f" Route: {node_0['route']}.{param_name_0}(base) vs {node_1['route']}.{param_name_1}(raw)\n" + f"{e}\n\n" ) - ) - logger.log_file(log_name, "a", info) + logger.log_file(log_name, "a", info) - try: - process_each_param(checker, node_lists, reports, compare_target, cfg) - except Exception as e: - logger.error("=" * 10 + f"Err occurs when compare {compare_target}!!!" + "=" * 10 + "\n" + str(e)) + if diff_found: + logger.error(f"The {compare_target} comparing failed !!! Please check report '{logger.log_path}/{log_name}'.") return False - if flag == False: - logger.info( - f"Diff found when compare {compare_target}, please check report \n {logger.log_path}/{log_name}" - ) - else: - logger.info(f"{compare_target} compared.") - - return flag + logger.info(f"The {compare_target} comparing compared.") + return True diff --git a/padiff/comparison/manual.py b/padiff/comparison/manual.py index 6554df6..de54a4d 100644 --- a/padiff/comparison/manual.py +++ b/padiff/comparison/manual.py @@ -23,53 +23,53 @@ def compare_dumps(dump_path1, dump_path2, cfg=None, diff_phase="both"): try: report_success = check_report(dump_path1, dump_path2, cfg=cfg, diff_phase=diff_phase) if report_success: - logger.info("✅ check_report: SUCCESS !!!") + logger.info("✅ check_report: SUCCESS !!!\n") else: - logger.warning("❌ check_report: FAILED !!!") + logger.error("❌ check_report: FAILED !!!\n") except Exception as e: - logger.error(f"❌ check_report: FAILED with error: {e}") + logger.error(f"❌ check_report: FAILED with error: {e}\n") report_success = False # check grads grads_success = None if os.path.exists(f"{dump_path1}/grads.json") and os.path.exists(f"{dump_path2}/grads.json"): - logger.info("\n🔍 Start comparison grads (check_grads)...") + logger.info("🔍 Start comparison grads (check_grads)...") try: grads_success = check_grads(dump_path1, dump_path2, cfg=cfg) if grads_success: - logger.info("✅ check_grads: SUCCESS !!!") + logger.info("✅ check_grads: SUCCESS !!!\n") else: - logger.warning("❌ check_grads: FAILED !!!") + logger.error("❌ check_grads: FAILED !!!\n") except Exception as e: - logger.error(f"❌ check_grads: FAILED with error: {e}") + logger.error(f"❌ check_grads: FAILED with error: {e}\n") grads_success = False # check weights weights_success = None if os.path.exists(f"{dump_path1}/weights.json") and os.path.exists(f"{dump_path2}/weights.json"): - logger.info("\n🔍 Start comparison weights (check_weights)...") + logger.info("🔍 Start comparison weights (check_weights)...") try: weights_success = check_weights(dump_path1, dump_path2, cfg=cfg) if weights_success: - logger.info("✅ check_weights: SUCCESS !!!") + logger.info("✅ check_weights: SUCCESS !!!\n") else: - logger.warning("❌ check_weights: FAILED !!!") + logger.error("❌ check_weights: FAILED !!!\n") except Exception as e: - logger.error(f"❌ check_weights: FAILED with error: {e}") + logger.error(f"❌ check_weights: FAILED with error: {e}\n") weights_success = False # check params params_success = None if os.path.exists(f"{dump_path1}/params.json") and os.path.exists(f"{dump_path2}/params.json"): - logger.info("\n🔍 Start comparison all parameters (check_params)...") + logger.info("🔍 Start comparison all parameters (check_params)...") try: params_success = check_params(dump_path1, dump_path2, cfg=cfg) if params_success: - logger.info("✅ check_params: SUCCESS !!!") + logger.info("✅ check_params: SUCCESS !!!\n") else: - logger.warning("❌ check_params: FAILED !!!") + logger.error("❌ check_params: FAILED !!!\n") except Exception as e: - logger.error(f"❌ check_params: FAILED with error: {e}") + logger.error(f"❌ check_params: FAILED with error: {e}\n") params_success = False # final result @@ -80,7 +80,7 @@ def compare_dumps(dump_path1, dump_path2, cfg=None, diff_phase="both"): if success: logger.info(f"🎉 final comparison result: SUCCESS !!!") else: - logger.warning(f"❌ final comparison result: FAILED !!!") + logger.error(f"❌ final comparison result: FAILED !!!") return success diff --git a/padiff/utils/log.py b/padiff/utils/log.py index 8645651..2ca457c 100644 --- a/padiff/utils/log.py +++ b/padiff/utils/log.py @@ -107,13 +107,12 @@ def log_file(self, filename, mode, info): def print_report_info(nodes, reports, exc, stage, msg=None): - logger.error("FAILED !!!") - logger.error("DIFF DETAILS:") - logger.error(f" '{stage}' Stage Mismatch") - logger.error(f" Layer: {nodes[0]['name']} vs {nodes[1]['name']}") - logger.error(f" Route: {nodes[0]['route']} vs {nodes[1]['route']} \n") - - logger.error(f"Error({type(exc).__name__}): {str(exc)} \n") + logger.error( + f"FAILED !!! '{stage}' Stage Mismatch! \n" + f" Layer: {nodes[0]['name']} vs {nodes[1]['name']} \n" + f" Route: {nodes[0]['route']} vs {nodes[1]['route']} \n" + f"Error({type(exc).__name__}): {str(exc)} \n" + ) if msg is not None: logger.warning("ADDITIONAL MESSAGE:") diff --git a/padiff/utils/optim.py b/padiff/utils/optim.py new file mode 100644 index 0000000..03c7831 --- /dev/null +++ b/padiff/utils/optim.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools + +from ..tools import dump_grads + + +def wrap_optimizer_step(optimizer): + if hasattr(optimizer, "_original_step"): + return + + original_step = optimizer.step + + @functools.wraps(original_step) + def wrapped_step(): + original_step() + + proxy_model = getattr(optimizer, "_padiff_proxy_model", None) + if proxy_model is not None: + dump_grads(proxy_model, proxy_model.dump_path) + + optimizer.step = wrapped_step + optimizer._original_step = original_step