Hi authors,
Thanks for the great work! I have been reading your paper and found the Task Interference Score formulation particularly interesting.
In the paper, you define the change in loss for task i caused by an update along task j’s gradient direction as follows:
Equation (2):

$$\Delta_j L_i(x_i) \;=\; L_i\big(\theta - \eta\,\nabla_\theta L_j(x_j)\big) \;-\; L_i(\theta)$$

Equation (3):

$$\Delta_j L_i(x_i) \;\approx\; -\eta\,\nabla_\theta L_i(x_i)^{\top}\,\nabla_\theta L_j(x_j)$$

where $\theta$ denotes the shared parameters and $\eta$ the step size.
However, when simplifying these equations (dividing the first-order change by $\eta$ and by the two gradient norms), I found that:

$$\mathrm{TIS}_{ij} \;\approx\; \frac{\nabla_\theta L_i(x_i)^{\top}\,\nabla_\theta L_j(x_j)}{\|\nabla_\theta L_i(x_i)\|\,\|\nabla_\theta L_j(x_j)\|}$$

which essentially reduces the Task Interference Score to the cosine similarity between the two task gradients.
This seems to contradict the visualizations (e.g., the heatmaps and bar plots) in the paper, where some values fall outside the range $[-1, 1]$.
Additionally, cosine similarity is symmetric in $i$ and $j$, but the scores in the figures do not always appear to be symmetric.
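Both observations follow directly from the simplification above (under my assumption that the score is normalized by $\eta$ and the gradient norms):

$$\left|\frac{\nabla_\theta L_i(x_i)^{\top}\,\nabla_\theta L_j(x_j)}{\|\nabla_\theta L_i(x_i)\|\,\|\nabla_\theta L_j(x_j)\|}\right| \;\le\; 1 \quad \text{(Cauchy–Schwarz)}, \qquad \mathrm{TIS}_{ij} = \mathrm{TIS}_{ji},$$

so I suspect I am missing either a step in the derivation or a detail of the implementation.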
🔍 Question
The only implementation of this score that I was able to find in the codebase is the snippet below:
# example snippet (imports added here for completeness)
import torch
import torch.nn as nn
import torch.distributed as dist
from deepspeed.utils import safe_get_full_grad

def reset_record_layers(record_layers, check_layers, cmoa=False, n_experts=0):
    """Initialize record_layers with one empty list per tracked LoRA parameter."""
    layers = list(range(32))
    # layers = [5, 10, 25, 31]
    if cmoa:
        # base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.4
        for check_layer in check_layers:
            for layer_i in layers:
                record_layers[f"layers.{layer_i}.{check_layer}"] = {
                    f"lora_A.default.{n}": [] for n in range(n_experts)
                }
                record_layers[f"layers.{layer_i}.{check_layer}"].update({
                    f"lora_B.default.{n}": [] for n in range(n_experts)
                })
    else:
        for check_layer in check_layers:
            for layer_i in layers:
                record_layers[f"layers.{layer_i}.{check_layer}"] = {
                    "lora_A.default.weight": [],
                    "lora_B.default.weight": [],
                }
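For reference, here is how I am initializing the dict on my side (my own driver code, not from the repo; the check_layers values are a subset of the module names hard-coded in record_gradient_alignment_cmoa below):

# Minimal usage sketch (my assumption of the intended call pattern)
record_layers = {}
check_layers = ["self_attn.q_proj", "mlp.down_proj"]  # subset, for illustration
reset_record_layers(record_layers, check_layers, cmoa=False)
# Keys are "layers.{i}.{module}"; each maps LoRA param names to empty lists:
# record_layers["layers.0.self_attn.q_proj"]
#   == {"lora_A.default.weight": [], "lora_B.default.weight": []}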
def record_gradient(model, record_layers, is_deepspeed_enabled=False):
    """Snapshot the current gradient of every tracked parameter."""
    for name, p in model.named_parameters():
        for layer_name, layer_dict in record_layers.items():
            if layer_name in name:
                for param_name in layer_dict.keys():
                    # prefix_param = '.'.join(param_name.split('.')[:-1])
                    if param_name in name and param_name.split('.')[-1] == name.split('.')[-1]:
                        if is_deepspeed_enabled:
                            # ZeRO shards gradients, so gather the full gradient first
                            grad = safe_get_full_grad(p)
                            if dist.get_rank() == 0:
                                record_layers[layer_name][param_name].append(
                                    grad.flatten().detach().cpu())
                        else:
                            grad = p.grad.flatten().detach().cpu()
                            record_layers[layer_name][param_name].append(grad)
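In case it helps pin down where my understanding diverges, this is roughly how I imagined the per-task gradients being collected (my own sketch; task_batches and the loss computation are placeholders, and the final aggregation of the per-step lists into a single tensor is my guess, since record_gradient_alignment indexes the entries as tensors):

# Hypothetical per-task gradient collection (not repo code)
task_grads = {}
for task_id, batch in task_batches.items():  # task_batches is a placeholder
    model.zero_grad()
    loss = model(**batch).loss  # HF-style loss; placeholder
    loss.backward()
    grads = {}
    reset_record_layers(grads, check_layers)
    record_gradient(model, grads)
    # collapse each list of snapshots into one flat tensor (my assumption)
    task_grads[task_id] = {
        layer: {p: torch.stack(v).mean(dim=0) for p, v in params.items()}
        for layer, params in grads.items()
    }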
def record_gradient_alignment(task_grads, record_layers):
    """Compute the cosine similarity between gradients of different tasks."""
    task_pairs = [(task_id_1, task_id_2) for i, task_id_1 in enumerate(task_grads.keys())
                  for task_id_2 in list(task_grads.keys())[i + 1:]]
    for task_id_1, task_id_2 in task_pairs:
        for layer_name in task_grads[task_id_1].keys():
            for param_name in task_grads[task_id_1][layer_name].keys():
                cos_sim = nn.functional.cosine_similarity(
                    task_grads[task_id_1][layer_name][param_name],
                    task_grads[task_id_2][layer_name][param_name], dim=0)
                # if layer_name in record_layers:
                record_layers[layer_name][param_name].append(cos_sim.item())
                # check_layer = '.'.join(layer_name.split('.')[2:])
                # record_layers[f"layers.all.{check_layer}"][param_name].append(cos_sim.item())
    # Average the per-pair similarities for each tracked parameter
    for layer_name in record_layers.keys():
        for param_name in record_layers[layer_name].keys():
            record_layers[layer_name][param_name] = torch.tensor(
                record_layers[layer_name][param_name]).mean().item()
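Just to double-check my claim about the range, this is the sanity test I ran on my side (my own code, not from the repo):

# cosine_similarity of two flat gradient vectors is always in [-1, 1],
# and symmetric in its arguments
import torch
import torch.nn as nn

g1, g2 = torch.randn(4096), torch.randn(4096)
cos = nn.functional.cosine_similarity(g1, g2, dim=0)
assert -1.0 <= cos.item() <= 1.0
assert torch.isclose(cos, nn.functional.cosine_similarity(g2, g1, dim=0))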
def record_gradient_alignment_cmoa(task_grads, record_layers, n_experts):
    """Compute the cosine similarity between gradients of different tasks (CMoA variant)."""
    # Precompute the unique pairs of tasks
    task_pairs = [(task_id_1, task_id_2) for i, task_id_1 in enumerate(task_grads.keys())
                  for task_id_2 in list(task_grads.keys())[i + 1:]]
    check_layers = ["mlp.down_proj", "mlp.up_proj", "mlp.gate_proj",
                    "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj"]
    for task_id_1, task_id_2 in task_pairs:
        for check_layer in check_layers:
            A_grads_1, A_grads_2 = [], []
            B_grads_1, B_grads_2 = [], []
            for layer_i in range(32):
                layer_name = f"layers.{layer_i}.{check_layer}"
                for n in range(n_experts):
                    A_param_name = f"lora_A.default.{n}"
                    A_grads_1.append(task_grads[task_id_1][layer_name][A_param_name])
                    A_grads_2.append(task_grads[task_id_2][layer_name][A_param_name])
                    B_param_name = f"lora_B.default.{n}"
                    B_grads_1.append(task_grads[task_id_1][layer_name][B_param_name])
                    B_grads_2.append(task_grads[task_id_2][layer_name][B_param_name])
            # Batch the cosine similarities over all (layer, expert) pairs at once
            cos_sim_A = nn.functional.cosine_similarity(torch.stack(A_grads_1),
                                                        torch.stack(A_grads_2), dim=1)
            cos_sim_B = nn.functional.cosine_similarity(torch.stack(B_grads_1),
                                                        torch.stack(B_grads_2), dim=1)
            c = 0
            for layer_i in range(32):
                layer_name = f"layers.{layer_i}.{check_layer}"
                for n in range(n_experts):
                    A_param_name = f"lora_A.default.{n}"
                    B_param_name = f"lora_B.default.{n}"
                    record_layers[layer_name][A_param_name].append(cos_sim_A[c].item())
                    record_layers[layer_name][B_param_name].append(cos_sim_B[c].item())
                    c += 1
    # Average the recorded similarities over all task pairs
    for layer_name, params in record_layers.items():
        for param_name, values in params.items():
            record_layers[layer_name][param_name] = torch.tensor(values).mean().item()

Would you mind elaborating on the exact derivation or implementation logic behind the Task Interference Score?
Specifically, it would be helpful to understand:
- Whether any additional scaling, normalization, or approximation is applied beyond the theoretical formulation.
- How exactly $\Delta_j L_i(x_i)$ is computed in practice (see the sketch below for my naive reading).
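For the second point, my naive reading of Equation (2) is a literal finite difference: evaluate $L_i$ before and after taking one step along $-\eta\,\nabla_\theta L_j(x_j)$. A minimal sketch of what I mean (purely illustrative; loss_i_fn and grads_j are hypothetical placeholders, and I am not claiming this is what the paper or the repo actually does):

import copy
import torch

def delta_j_loss_i(model, loss_i_fn, grads_j, eta=1e-3):
    """Return L_i(theta - eta * grad_j) - L_i(theta), probing a copied model."""
    base = loss_i_fn(model)  # L_i at the current parameters
    probe = copy.deepcopy(model)  # leave the real weights untouched
    with torch.no_grad():
        # one SGD-style step along task j's gradient
        for p, g in zip(probe.parameters(), grads_j):
            p -= eta * g
    return loss_i_fn(probe) - base

If instead only the first-order approximation in Equation (3) is used, is the dot product rescaled by anything other than the gradient norms?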
I'm very interested in understanding this better and would greatly appreciate any clarification you can provide.
Thanks again for your excellent work and contribution to the community! 🙌