# DSM/run_eval_pipeline.py — Gleghorn-Lab/DSM (main branch)
#!/usr/bin/env python
import argparse
import subprocess
import os
import sys


def run_command(command, description=None, redact_flags=("--token",)):
    """Run a command as a subprocess and exit the program if it fails.

    Args:
        command: List of string arguments passed to ``subprocess.run``
            (no shell is involved).
        description: Optional heading printed before the command is echoed.
        redact_flags: Flags whose value is a secret. The value (either the
            following argument or the part after ``=``) is replaced by
            ``***`` in everything this function prints. The subprocess
            itself still receives the real value.

    Exits the process with status 1 (via ``sys.exit``) when the command
    returns a non-zero status or cannot be started at all.
    """
    if description:
        print(f"\n=== {description} ===")

    # Build a log-safe copy of the argv so credentials never reach
    # stdout/stderr (and therefore CI logs or shared terminals).
    shown = []
    mask_next = False
    for arg in command:
        flag, sep, _ = arg.partition("=")
        if mask_next:
            shown.append("***")
            mask_next = False
        elif sep and flag in redact_flags:
            shown.append(f"{flag}=***")
        else:
            shown.append(arg)
            mask_next = arg in redact_flags
    shown_cmd = " ".join(shown)

    # Flush so the echo appears before the child's output when stdout is piped.
    print(f"Running: {shown_cmd}", flush=True)

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        # str(e) embeds the full, unredacted argv, so format the message
        # ourselves from the redacted copy.
        print(
            f"Error executing command: '{shown_cmd}' returned non-zero "
            f"exit status {e.returncode}",
            file=sys.stderr,
        )
        sys.exit(1)
    except OSError as e:
        # Raised when the executable is missing or cannot be started.
        print(f"Error executing command: {e}", file=sys.stderr)
        sys.exit(1)


def main(argv=None):
    """Run the DSM evaluation pipeline end to end as a chain of subprocesses.

    Each stage is launched as ``<python> -m evaluation.<module>`` through
    ``run_command``, which terminates the whole pipeline on the first
    failing stage. Stages, in order:

    1. (unless ``--skip_tuning``) temperature sweep, then step-size sweep,
       both via ``evaluation.unconditional_generation_tuning``
    2. ``evaluation.unconditional_generation``
    3. ``evaluation.ss_pred``
    4. ``evaluation.annotate_comparisons``
    5. ``evaluation.compare_distributions``
    6. ``evaluation.plot_distribution_comparisons``

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (the default), ``sys.argv[1:]`` is parsed as usual.
    """
    parser = argparse.ArgumentParser(description="Run the complete DSM evaluation pipeline")
    parser.add_argument("--token", required=True, help="Hugging Face token")
    parser.add_argument("--data_dir", default="evaluation_data", help="Directory to store output data")
    parser.add_argument("--temp_sweep_divisor", type=int, default=100,
                        help="Step divisor for temperature sweep (higher is faster)")
    parser.add_argument("--temperature", type=float, default=0.85,
                        help="Temperature passed to the step size sweep "
                             "(the temperature sweep output is not parsed automatically)")
    parser.add_argument("--skip_tuning", action="store_true", help="Skip parameter tuning steps")
    args = parser.parse_args(argv)

    # Launch every stage with the interpreter running this script, so the
    # children share its environment instead of whichever "python" is
    # first on PATH (or none at all).
    python = sys.executable or "python"

    # Create output directory if it doesn't exist
    os.makedirs(args.data_dir, exist_ok=True)

    # Each stage reads the file written by the previous one.
    base_csv = os.path.join(args.data_dir, "generated_sequences.csv")
    ss_csv = os.path.join(args.data_dir, "generated_sequences_ss.csv")
    annotated_csv = os.path.join(args.data_dir, "generated_sequences_ss_ann.csv")
    dist_output = os.path.join(args.data_dir, "distributions")

    # Step 1: Parameter tuning (if not skipped)
    if not args.skip_tuning:
        # Tune temperature
        run_command(
            [
                python, "-m", "evaluation.unconditional_generation_tuning",
                "--token", args.token,
                "--sweep_temp",
                "--step_divisor", str(args.temp_sweep_divisor),
            ],
            "Temperature parameter tuning",
        )

        # The temperature sweep result is not captured or parsed here, so the
        # temperature for the next sweep comes from --temperature.
        best_temp = args.temperature

        # Tune step size with the chosen temperature
        run_command(
            [
                python, "-m", "evaluation.unconditional_generation_tuning",
                "--token", args.token,
                "--sweep_step",
                "--temperature", str(best_temp),
            ],
            "Step size parameter tuning",
        )

    # Step 2: Generate sequences
    run_command(
        [
            python, "-m", "evaluation.unconditional_generation",
            "--token", args.token,
            "--output_path", base_csv,
        ],
        "Generating sequences",
    )

    # Step 3: Predict secondary structures
    run_command(
        [
            python, "-m", "evaluation.ss_pred",
            "--token", args.token,
            "--input_path", base_csv,
            "--output_path", ss_csv,
        ],
        "Predicting secondary structures",
    )

    # Step 4: Annotate comparisons
    run_command(
        [
            python, "-m", "evaluation.annotate_comparisons",
            "--token", args.token,
            "--input_path", ss_csv,
            "--output_path", annotated_csv,
        ],
        "Annotating sequences with protein properties",
    )

    # Step 5: Compare distributions
    run_command(
        [
            python, "-m", "evaluation.compare_distributions",
            "--input_path", annotated_csv,
            "--output_path", dist_output,
        ],
        "Comparing distributions",
    )

    # Step 6: Plot results
    # NOTE(review): no paths are passed to the plotting module, so it
    # presumably reads from its own default locations — confirm that these
    # match dist_output when --data_dir is not the default.
    run_command(
        [
            python, "-m", "evaluation.plot_distribution_comparisons",
        ],
        "Plotting distribution comparisons",
    )

    print("\n=== Evaluation pipeline completed successfully ===")
    print(f"Results stored in: {args.data_dir}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()