Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion padiff/comparison/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def get_action_by_name(self, name):
raise ValueError(f"Action '{name}' not registered. Available: {list(self.pool.keys())}")
return self.pool[name]

def find_actions(self, report_0, node_0, report_1, node_1, name=None):
def find_actions(self, report_0=None, node_0=None, report_1=None, node_1=None, name=None):
if name is not None:
return self.get_action_by_name(name)

Expand Down Expand Up @@ -85,11 +85,16 @@ def __call__(self, file_list_0, file_list_1, cfg):
for info_0, info_1 in zip(file_list_0, file_list_1):
tensor_0 = load_numpy(info_0["path"])
tensor_1 = load_numpy(info_1["path"])

if "transpose" in cfg and cfg["transpose"]:
tensor_1 = np.transpose(tensor_1)

if tensor_0.size == 0 or tensor_1.size == 0:
if tensor_0.size != tensor_1.size:
raise RuntimeError("size of tensors is not equal")
logger.warning("Found nparray.size == 0, compare skipped!")
continue

assert_tensor_equal(tensor_0, tensor_1, cfg)


Expand All @@ -113,17 +118,37 @@ def __call__(self, file_list_0, file_list_1, cfg):
for info_0, info_1 in zip(file_list_0[:min_len], file_list_1[:min_len]):
tensor_0 = load_numpy(info_0["path"])
tensor_1 = load_numpy(info_1["path"])

if cfg["transpose"]:
tensor_1 = np.transpose(tensor_1)

if tensor_0.size == 0 or tensor_1.size == 0:
logger.debug("Found empty tensor, compare skipped!")
continue

if tensor_0.shape != tensor_1.shape:
logger.debug(f"Shape of tensors are not equal: {tensor_0.shape}!={tensor_1.shape}")
if tensor_0.size == tensor_1.size:
logger.debug(f"Try to reshape them to {tensor_0.shape}")
tensor_1 = np.reshape(tensor_1, tensor_0.shape)
else:
continue

assert_tensor_equal(tensor_0, tensor_1, cfg)
num_success += 1

if min_len != 0 and num_success == 0:
raise RuntimeError("All outputs for the layer have different shape!")


@global_actions.register("ignore")
class IgnoreAction(Action):
    """Catch-all action, registered under the name ``"ignore"``, that compares nothing."""

    @property
    def priority(self):
        """Fixed priority value (100) reported for this action."""
        return 100

    def match(self, report_0, node_0, report_1, node_1):
        """Accept every pair of nodes unconditionally; the arguments are not inspected."""
        return True

    def __call__(self, file_list_0, file_list_1, cfg):
        """Intentionally do nothing with the given file lists and config."""
        return None
32 changes: 1 addition & 31 deletions padiff/comparison/checker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,37 +16,7 @@
import torch
from itertools import zip_longest
import numpy as np
from ...configs import global_yaml_loader
from ...utils import load_numpy, struct_info_log, assert_tensor_equal, logger


def process_each_param(process, node_lists, reports, compare_target, cfg):
    """Run ``process`` on every aligned pair of parameters from two models.

    The two node lists are walked in lockstep. For each node pair, the entries
    of ``node[compare_target]`` (a mapping of parameter name -> dumped file
    path) are paired positionally, loaded with ``load_numpy`` and handed to
    ``process`` together with the per-parameter settings.

    Args:
        process: Callable invoked as
            ``process([node_0, node_1], [name_0, name_1], [param_0, param_1], settings)``.
        node_lists: Pair of node sequences, one per model.
        reports: Pair of report dicts; ``reports[i]["framework"]`` is read.
        compare_target: Key selecting which mapping of each node to compare.
        cfg: Mapping merged on top of the settings returned by
            ``global_yaml_loader.get_weight_settings`` (``cfg`` wins on conflicts).

    Raises:
        RuntimeError: If the node lists differ in length, or if loading or
            processing any parameter pair fails. In the latter case the
            original exception is attached as ``__cause__``.
    """
    for node_0, node_1 in zip_longest(node_lists[0], node_lists[1], fillvalue=None):
        # zip_longest pads the shorter list with None, which exposes a length mismatch.
        if node_0 is None or node_1 is None:
            raise RuntimeError("Found model with difference number of sublayers. Check your model.")
        # NOTE: plain zip() stops at the shorter mapping, so surplus parameters
        # on one side are silently left uncompared.
        for (param_name_0, param_path_0), (param_name_1, param_path_1) in zip(
            node_0[compare_target].items(),
            node_1[compare_target].items(),
        ):
            try:
                settings = global_yaml_loader.get_weight_settings(
                    (node_0["name"], node_1["name"]),
                    (reports[0]["framework"], reports[1]["framework"]),
                    (param_name_0, param_name_1),
                )
                settings.update(cfg)
                param_0 = load_numpy(param_path_0)
                param_1 = load_numpy(param_path_1)
                process([node_0, node_1], [param_name_0, param_name_1], [param_0, param_1], settings)
            except Exception as e:
                err_str = f"{type(e).__name__ + ': ' + str(e)}\n"
                err_str += "Error occured between:\n"
                err_str += f" (base_model): {node_0['route'] + '.' + param_name_0}\n"
                err_str += f" (raw_model): {node_1['route'] + '.' + param_name_1}\n\n"

                err_str += struct_info_log(reports, (compare_target, compare_target), compare_target)

                # Chain explicitly so the original traceback is reported as the direct cause.
                raise RuntimeError(err_str) from e
from ...utils import assert_tensor_equal, logger


def assert_shape(params, settings):
Expand Down
118 changes: 62 additions & 56 deletions padiff/comparison/checker/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os.path as osp
from itertools import zip_longest

from ...utils import logger, build_file_name, get_all_valid_path, load_json, traversal_node
from ...configs import parse_cfg
from .base import assert_weight, assert_grad, process_each_param
from ...configs import parse_cfg, global_yaml_loader
from ..actions import get_action


def check_params(report_path_0, report_path_1, cfg=None):
Expand All @@ -30,79 +32,83 @@ def check_params(report_path_0, report_path_1, cfg=None):
node_lists = [traversal_node(rep["tree"], []) for rep in reports]

logger.info(f"Checking params in {path_0} and {path_1}")
weight_rst = weight_rst and check_target(assert_weight, node_lists, reports, "weights", cfg)
grad_rst = grad_rst and check_target(assert_grad, node_lists, reports, "grads", cfg)
weight_rst = weight_rst and _check_params_impl(node_lists, reports, "weights", cfg)
grad_rst = grad_rst and _check_params_impl(node_lists, reports, "grads", cfg)
return weight_rst and grad_rst


def check_weights(report_path_0, report_path_1, cfg=None):
cfg = parse_cfg(cfg)
logger.info(f"Check weights cfg: {cfg}")
return _check_param_impl(report_path_0, report_path_1, "weights", cfg)

weight_rst = True
all_ranks_path_0, all_ranks_path_1 = get_all_valid_path(report_path_0, report_path_1)
for path_0, path_1 in zip(all_ranks_path_0, all_ranks_path_1):
reports = [load_json(path_0, "weights.json"), load_json(path_1, "weights.json")]
node_lists = [traversal_node(rep["tree"], []) for rep in reports]

logger.info(f"Checking weights in {path_0} and {path_1}")
weight_rst = weight_rst and check_target(assert_weight, node_lists, reports, "weights", cfg)
return weight_rst
def check_grads(report_path_0, report_path_1, cfg=None):
return _check_param_impl(report_path_0, report_path_1, "grads", cfg)


def check_grads(report_path_0, report_path_1, cfg=None):
def _check_param_impl(report_path_0, report_path_1, compare_target, cfg=None):
cfg = parse_cfg(cfg)
logger.info(f"Check grads cfg: {cfg}")
logger.info(f"Check {compare_target} cfg: {cfg}")

grad_rst = True
param_rst = True
all_ranks_path_0, all_ranks_path_1 = get_all_valid_path(report_path_0, report_path_1)
for path_0, path_1 in zip(all_ranks_path_0, all_ranks_path_1):
reports = [load_json(path_0, "grads.json"), load_json(path_1, "grads.json")]
reports = [load_json(path_0, f"{compare_target}.json"), load_json(path_1, f"{compare_target}.json")]
node_lists = [traversal_node(rep["tree"], []) for rep in reports]

logger.info(f"Checking grads in {path_0} and {path_1}")
grad_rst = grad_rst and check_target(assert_grad, node_lists, reports, "grads", cfg)
return grad_rst
logger.info(f"Checking {compare_target} in {path_0} and {path_1}")
param_rst = param_rst and _check_params_impl(node_lists, reports, compare_target, cfg)
return param_rst


def check_target(fn, node_lists, reports, compare_target, cfg):
flag = True
def _check_params_impl(node_lists, reports, compare_target, cfg):
diff_found = False
log_name = build_file_name(reports[0], compare_target + "_diff")
if osp.exists(osp.join(logger.log_path, log_name)):
with open(osp.join(logger.log_path, log_name), "w") as f:
pass

action_name = cfg.get("action_name", None)
act = get_action(name=action_name)

for node_0, node_1 in zip_longest(node_lists[0], node_lists[1], fillvalue=None):
if node_0 is None or node_1 is None:
raise RuntimeError("Found model with difference number of sublayers. Check your model.")

for (param_name_0, param_path_0), (param_name_1, param_path_1) in zip(
node_0[compare_target].items(),
node_1[compare_target].items(),
):
try:
assert (
param_path_0 is not None and param_path_1 is not None
), f"{compare_target.capitalize()} for at least one of base or raw model is not found."

settings = global_yaml_loader.get_weight_settings(
(node_0["name"], node_1["name"]),
(reports[0]["framework"], reports[1]["framework"]),
(param_name_0, param_name_1),
)
settings.update(cfg)

def checker(nodes, param_names, params, settings):
try:
fn(params, settings)
except Exception as e:
nonlocal flag
flag = False
info = (
"=" * 25 + "\n" + "{} value is different.\n"
"between base_model: {}\n"
" raw_model: {}\n\n"
"base_model param path:\n {}\n"
"raw_model param path:\n {}\n\n"
"{}\n\n".format(
compare_target,
nodes[0]["repr"],
nodes[1]["repr"],
nodes[0]["route"] + "." + param_names[0],
nodes[1]["route"] + "." + param_names[1],
type(e).__name__ + ": " + str(e),
file_list_0 = [{"path": param_path_0}]
file_list_1 = [{"path": param_path_1}]

act(file_list_0, file_list_1, settings)

except Exception as e:
diff_found = True
info = (
f"=========================\n"
f"FAILED!!! {compare_target.capitalize()} Mismatch:\n"
f" Layer: {node_0['name']}(base) vs {node_1['name']}(raw)\n"
f" Route: {node_0['route']}.{param_name_0}(base) vs {node_1['route']}.{param_name_1}(raw)\n"
f"{e}\n\n"
)
)
logger.log_file(log_name, "a", info)
logger.log_file(log_name, "a", info)

try:
process_each_param(checker, node_lists, reports, compare_target, cfg)
except Exception as e:
logger.error("=" * 10 + f"Err occurs when compare {compare_target}!!!" + "=" * 10 + "\n" + str(e))
if diff_found:
logger.error(f"The {compare_target} comparing failed !!! Please check report '{logger.log_path}/{log_name}'.")
return False

if flag == False:
logger.info(
f"Diff found when compare {compare_target}, please check report \n {logger.log_path}/{log_name}"
)
else:
logger.info(f"{compare_target} compared.")

return flag
logger.info(f"The {compare_target} comparing compared.")
return True
32 changes: 16 additions & 16 deletions padiff/comparison/manual.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,53 +23,53 @@ def compare_dumps(dump_path1, dump_path2, cfg=None, diff_phase="both"):
try:
report_success = check_report(dump_path1, dump_path2, cfg=cfg, diff_phase=diff_phase)
if report_success:
logger.info("✅ check_report: SUCCESS !!!")
logger.info("✅ check_report: SUCCESS !!!\n")
else:
logger.warning("❌ check_report: FAILED !!!")
logger.error("❌ check_report: FAILED !!!\n")
except Exception as e:
logger.error(f"❌ check_report: FAILED with error: {e}")
logger.error(f"❌ check_report: FAILED with error: {e}\n")
report_success = False

# check grads
grads_success = None
if os.path.exists(f"{dump_path1}/grads.json") and os.path.exists(f"{dump_path2}/grads.json"):
logger.info("\n🔍 Start comparison grads (check_grads)...")
logger.info("🔍 Start comparison grads (check_grads)...")
try:
grads_success = check_grads(dump_path1, dump_path2, cfg=cfg)
if grads_success:
logger.info("✅ check_grads: SUCCESS !!!")
logger.info("✅ check_grads: SUCCESS !!!\n")
else:
logger.warning("❌ check_grads: FAILED !!!")
logger.error("❌ check_grads: FAILED !!!\n")
except Exception as e:
logger.error(f"❌ check_grads: FAILED with error: {e}")
logger.error(f"❌ check_grads: FAILED with error: {e}\n")
grads_success = False

# check weights
weights_success = None
if os.path.exists(f"{dump_path1}/weights.json") and os.path.exists(f"{dump_path2}/weights.json"):
logger.info("\n🔍 Start comparison weights (check_weights)...")
logger.info("🔍 Start comparison weights (check_weights)...")
try:
weights_success = check_weights(dump_path1, dump_path2, cfg=cfg)
if weights_success:
logger.info("✅ check_weights: SUCCESS !!!")
logger.info("✅ check_weights: SUCCESS !!!\n")
else:
logger.warning("❌ check_weights: FAILED !!!")
logger.error("❌ check_weights: FAILED !!!\n")
except Exception as e:
logger.error(f"❌ check_weights: FAILED with error: {e}")
logger.error(f"❌ check_weights: FAILED with error: {e}\n")
weights_success = False

# check params
params_success = None
if os.path.exists(f"{dump_path1}/params.json") and os.path.exists(f"{dump_path2}/params.json"):
logger.info("\n🔍 Start comparison all parameters (check_params)...")
logger.info("🔍 Start comparison all parameters (check_params)...")
try:
params_success = check_params(dump_path1, dump_path2, cfg=cfg)
if params_success:
logger.info("✅ check_params: SUCCESS !!!")
logger.info("✅ check_params: SUCCESS !!!\n")
else:
logger.warning("❌ check_params: FAILED !!!")
logger.error("❌ check_params: FAILED !!!\n")
except Exception as e:
logger.error(f"❌ check_params: FAILED with error: {e}")
logger.error(f"❌ check_params: FAILED with error: {e}\n")
params_success = False

# final result
Expand All @@ -80,7 +80,7 @@ def compare_dumps(dump_path1, dump_path2, cfg=None, diff_phase="both"):
if success:
logger.info(f"🎉 final comparison result: SUCCESS !!!")
else:
logger.warning(f"❌ final comparison result: FAILED !!!")
logger.error(f"❌ final comparison result: FAILED !!!")
return success


Expand Down
13 changes: 6 additions & 7 deletions padiff/utils/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,12 @@ def log_file(self, filename, mode, info):

def print_report_info(nodes, reports, exc, stage, msg=None):

logger.error("FAILED !!!")
logger.error("DIFF DETAILS:")
logger.error(f" '{stage}' Stage Mismatch")
logger.error(f" Layer: {nodes[0]['name']} vs {nodes[1]['name']}")
logger.error(f" Route: {nodes[0]['route']} vs {nodes[1]['route']} \n")

logger.error(f"Error({type(exc).__name__}): {str(exc)} \n")
logger.error(
f"FAILED !!! '{stage}' Stage Mismatch! \n"
f" Layer: {nodes[0]['name']} vs {nodes[1]['name']} \n"
f" Route: {nodes[0]['route']} vs {nodes[1]['route']} \n"
f"Error({type(exc).__name__}): {str(exc)} \n"
)

if msg is not None:
logger.warning("ADDITIONAL MESSAGE:")
Expand Down
35 changes: 35 additions & 0 deletions padiff/utils/optim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools

from ..tools import dump_grads


def wrap_optimizer_step(optimizer):
    """Patch ``optimizer.step`` in place so gradients are dumped after each step.

    The replacement forwards all positional and keyword arguments to the
    original ``step`` and returns its result, so callers that pass arguments
    to ``step`` or use its return value behave as before. After the original
    step has run, if the optimizer carries a ``_padiff_proxy_model``
    attribute that is not ``None``, ``dump_grads`` is called with that model
    and its ``dump_path``.

    The untouched method is kept on ``optimizer._original_step``; its presence
    also marks the optimizer as already wrapped, making repeated calls no-ops.

    Args:
        optimizer: Object exposing a callable ``step`` attribute.

    Returns:
        None. The optimizer is modified in place.
    """
    if hasattr(optimizer, "_original_step"):
        # Already wrapped: wrapping again would dump gradients twice per step.
        return

    original_step = optimizer.step

    @functools.wraps(original_step)
    def wrapped_step(*args, **kwargs):
        # Forward arguments and keep the return value so the wrapper stays
        # transparent for step() signatures that accept parameters.
        result = original_step(*args, **kwargs)

        # Looked up on every call so a proxy model attached after wrapping is honoured.
        proxy_model = getattr(optimizer, "_padiff_proxy_model", None)
        if proxy_model is not None:
            dump_grads(proxy_model, proxy_model.dump_path)

        return result

    optimizer.step = wrapped_step
    optimizer._original_step = original_step
Loading