-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbackbone_eval.py
More file actions
125 lines (102 loc) · 4.59 KB
/
backbone_eval.py
File metadata and controls
125 lines (102 loc) · 4.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import concurrent.futures
import csv
import random
import subprocess
import fcntl
import time
from typing import Optional
import timm
import torch
import wandb
# NOTE(rob2u): a simple context manager to lock files
# This avoids the edge case that a task is executed twice
class FileLock:
    """Context manager holding an exclusive ``flock`` on an open file.

    The lock is taken on entry and released on exit. The wrapped file object
    is owned by the lock: it is always closed when the context ends, and also
    when acquiring the lock fails.

    Args:
        file: An open file object (anything ``fcntl.flock`` accepts).
    """

    def __init__(self, file):
        self.file = file

    def __enter__(self):
        try:
            fcntl.flock(self.file, fcntl.LOCK_EX)
        except BaseException:
            # ``__exit__`` is not called when ``__enter__`` raises, so close the
            # handle here; otherwise it would leak (callers typically pass an
            # inline ``open(...)`` and keep no other reference to it).
            self.file.close()
            raise
        return self.file

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            fcntl.flock(self.file, fcntl.LOCK_UN)
        finally:
            # Close even if the explicit unlock fails.
            self.file.close()
def read_backbones_todo_csv(file_path: str) -> list[str]:
    """Read backbone names from the first column of a CSV file.

    The first non-blank row is treated as the header and dropped. Blank lines
    (which ``csv.reader`` yields as empty lists) are ignored instead of
    raising ``IndexError``.

    Args:
        file_path: Path to the CSV file.

    Returns:
        The first-column value of every non-blank data row, in file order.
        An empty or header-only file yields an empty list.
    """
    with open(file_path, newline="") as csvfile:
        rows = [row for row in csv.reader(csvfile) if row]
    return [row[0] for row in rows[1:]]  # rows[0] is the header
def get_existing_runs() -> list[str]:
    """Return the backbone names that already have a run in the W&B project.

    Queries the ``gorillas/EVAL2-ALL-CXL-OpenSet`` project and strips the
    ``000-eval-`` style prefix from every run name.

    Returns:
        One entry per run: the run name with its first two ``-``-separated
        fields removed (names with fewer than two hyphens keep their last
        field, as before).
    """
    api = wandb.Api()
    project_path = "gorillas/EVAL2-ALL-CXL-OpenSet"
    runs = api.runs(project_path)
    # 000-eval-<backbone_name> -> <backbone_name>
    # Split at most twice so a backbone name that itself contains "-" is
    # returned whole rather than cut down to its last hyphen-separated piece.
    return [run.name.split("-", 2)[-1] for run in runs]
def get_command(backbone_name: str) -> Optional[list[str]]:
    """Build the ``train.py`` evaluation command for a timm backbone.

    The pretrained model is instantiated once to read its data config and to
    measure the embedding size with a dummy forward pass.

    Args:
        backbone_name: Model name passed to ``timm.create_model``.

    Returns:
        The argument list to hand to ``subprocess``, or ``None`` when the
        model cannot be created (``RuntimeError`` from timm).
    """
    try:
        model = timm.create_model(backbone_name, pretrained=True)
        model = model.eval()
    except RuntimeError as e:
        print(f"Error loading {backbone_name}: {e}")
        return None

    data_config = timm.data.resolve_model_data_config(model)
    mean = data_config["mean"]
    std = data_config["std"]

    # Only a warning: the model's own mean/std are forwarded to train.py below.
    # tuple(...) keeps the comparison meaningful if timm ever returns lists.
    is_normalize_matching = tuple(mean) == (0.485, 0.456, 0.406) and tuple(std) == (0.229, 0.224, 0.225)
    if not is_normalize_matching:
        print(f"Warning: data normalization does not match for {backbone_name}")
        print(f"Got: {data_config['mean']} {data_config['std']}")

    # Probe the embedding size. No gradients are needed for a shape probe, so
    # skip building the autograd graph; use the full resolved input_size so the
    # channel count is not hard-coded.
    with torch.no_grad():
        dummy_input = torch.randn(1, *data_config["input_size"])
        features_ = model.forward_features(dummy_input)
        embedding_size = model.forward_head(features_, pre_logits=True).shape[1]

    native_size = data_config["input_size"][-1]
    input_size = min(native_size, 768)  # Limit input size to 768
    if input_size != native_size:
        print(f"Warning: input size limited to 768 for {backbone_name}")

    return [
        "python",
        "train.py",
        "--config_path=cfgs/visiontransformer_cxl.yml",
        "--only_val=True",
        f"--run_name=000-eval-{backbone_name}",
        f"--embedding_size={embedding_size}",
        f"--data_resize_transform={input_size}",
        f"--model_name_or_path=timm_eval/{backbone_name}",
        f"--normalization_mean={list(data_config['mean'])}",
        f"--normalization_std={list(data_config['std'])}",
    ]
def run_command(backbone_name: str) -> tuple[Optional[str], Optional[str]]:
    """Evaluate one backbone in a subprocess and return its decoded output.

    Under an exclusive lock on ``.lock`` the function checks whether a run for
    the backbone already exists, builds the command, waits until at least
    15 seconds have passed since the lock was taken, and launches the process.
    It then blocks until the process exits.

    Args:
        backbone_name: Name of the backbone to evaluate.

    Returns:
        ``(stdout, stderr)`` of the subprocess as decoded strings, or
        ``(None, None)`` if a run already exists or no command could be built.
    """
    # NOTE(review): the source had its indentation stripped; the lock is assumed
    # to cover everything up to and including Popen, but not the wait for the
    # subprocess to finish — confirm against the original file.
    with FileLock(open(".lock", "w")):
        print(f"Running {backbone_name}")
        lock_acquired_at = time.time()

        # Skip the (expensive) command construction when the run already exists.
        command = None if backbone_name in get_existing_runs() else get_command(backbone_name)
        if command is None:
            return None, None

        seconds_in_lock = time.time() - lock_acquired_at
        # NOTE(rob2u): give the process at least 15 seconds to start in order to prevent duplicate wandb runs
        if seconds_in_lock < 15:
            time.sleep(16 - int(seconds_in_lock))

        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    print(f"waiting for {command}")
    raw_stdout, raw_stderr = process.communicate()  # blocks until the process exits
    print(f"finished {command}")
    return raw_stdout.decode(), raw_stderr.decode()
if __name__ == "__main__":
    # Create (or truncate) the lock file up front.
    with open(".lock", "w") as f:
        pass

    max_processes = 6

    # Everything listed in the CSV that has no run in the W&B project yet.
    backbone_list_all = read_backbones_todo_csv("backbone_names_all.csv")
    backbone_list_done = get_existing_runs()
    backbone_list_todo = list(set(backbone_list_all).difference(backbone_list_done))
    print(f"Backbones to evaluate: {len(backbone_list_todo)}")

    # NOTE(rob2u): we permute the list in order to distribute the load more evenly
    random.shuffle(backbone_list_todo)

    results = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_processes) as executor:
        # Map each future back to the backbone it evaluates.
        futures = {}
        for backbone_name in backbone_list_todo:
            futures[executor.submit(run_command, backbone_name)] = backbone_name

        for finished in concurrent.futures.as_completed(futures):
            backbone = futures[finished]
            try:
                stdout, stderr = finished.result()
                # Skipped backbones come back as (None, None) and are not recorded.
                if stdout is not None:
                    results.append((backbone, stdout, stderr))
                # NOTE(review): indentation was reconstructed; confirm whether this
                # message should only be printed for backbones that actually ran.
                print(f"{backbone} completed")
                # print(f"{backbone} completed with output: {stdout}")
            except Exception as exc:
                print(f"{backbone} generated an exception: {exc}")