forked from corinnabrungs/msn_tree_library
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsequence_creation.py
More file actions
306 lines (270 loc) · 9.93 KB
/
sequence_creation.py
File metadata and controls
306 lines (270 loc) · 9.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
import pandas as pd
from pandas import DataFrame
from datetime import date
from tqdm import tqdm
import logging
import os
from dataclasses import dataclass
import pandas_utils as pu
# Register tqdm's pandas integration so progress_apply shows progress bars.
tqdm.pandas()
# Timestamped log lines; DEBUG so every sequence-creation step is visible.
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)
# Name of the generated column holding each sample's base file name
# (date_uniqueid_suffix, see load_metadata_df).
base_filename_header = "base_filename"
@dataclass
class InstrumentMethod:
    """One Xcalibur instrument method: a short identifier plus its method file path."""

    # e.g. "positive" / "negative"; appended to data paths and file names
    identifier: str
    # path to the instrument method file (empty string disables the method)
    path: str
def main():
    """Script entry point: configure one library run and create Xcalibur sequences."""
    # cleaned metadata table with well_location / plate_id / unique_sample_id columns
    metadata_file = (
        r"C:\git\msn_library\data\iocb_libraries\iocb_peptide_library_cleaned.tsv"
    )
    # storage path for the acquired data
    data_filepath = r"D:\Xcalibur\data\Corinna_Brungs"

    # acquisition parameters
    injection_volume = 2  # micro liter
    library_id = "pluskal_iocb_peptide"
    suffix = "MSn"  # is added to the end of the path and file names

    # one entry per polarity; comment entries in or out to control acquisition
    methods = [
        InstrumentMethod(
            "positive",
            r"C:\Xcalibur\methods\Corinna_Brungs\Library6_100AGC_60000Res_MS5_POS_mz115-2000",
        ),
        InstrumentMethod(
            "negative",
            r"C:\Xcalibur\methods\Corinna_Brungs\Library6_100AGC_60000Res_MS5_NEG_mz115-2000",
        ),
        # InstrumentMethod("polarity_switching", r"Test"),
    ]

    # (plate_id, autosampler location) pairs describing where plates were inserted.
    # Leave plate_id empty if plates are not named via plate_id_header in the metadata.
    plate_locations = [
        ("Peptide01", "R"),
    ]

    create_orbitrap_sequence(
        metadata_file,
        data_filepath,
        methods,
        library_id,
        suffix,
        plate_locations,
        inject_volume_mul=injection_volume,
        blank_well=None,
        qc_well=None,
        blank_qc_autosampler_location="",
        blank_every_n_samples=20,
    )
def create_orbitrap_sequence(
    metadata_file,
    data_filepath: str,
    instrument_methods: list[InstrumentMethod],
    lib_id: str,
    method_suffix: str,
    plates_in_autosampler_location: list,
    unique_id_header="unique_sample_id",
    plate_id_header="plate_id",
    well_header="well_location",
    inject_volume_mul=2,
    blank_well=None,
    qc_well=None,
    blank_qc_autosampler_location=None,
    blank_every_n_samples=1000,
):
    """
    Create Xcalibur sequences for orbitrap instruments.

    Writes one sequence csv per plate and method, plus one combined csv with
    optional blank/QC rows interleaved.

    :param metadata_file: metadata file containing the well_location, plate_id,
        and unique_sample_id columns
    :param data_filepath: path to store acquired data to
    :param instrument_methods: methods and paths
    :param lib_id: defines the compound library
    :param method_suffix: defines the method, e.g., MSn, IT, HCD, ...
    :param plates_in_autosampler_location: list of (plate_id, location) tuples,
        e.g., ("15", "B")
    :param unique_id_header: defines a unique sample id. Must not end or start
        with a number so that contains matches are unique even for A1 and A10
    :param plate_id_header: plate id column. Plate ids can be any string or number
    :param well_header: defines the column with well locations, e.g., A1
    :param inject_volume_mul: micro liter injection volume
    :param blank_well: well of the blank sample, or None to skip blanks
    :param qc_well: well of the QC sample, or None to skip QCs
    :param blank_qc_autosampler_location: autosampler rack holding blank/QC
    :param blank_every_n_samples: insert blank/QC before every chunk of this size
    :return: None; sequence csv files are written to data/Sequence/
    """
    current_date = date.today().strftime("%Y%m%d")
    # NO NEED TO CHANGE ANYTHING BELOW
    # final acquisition path: base/lib/date_suffix (polarity is appended later)
    data_filepath = os.path.join(
        data_filepath, lib_id, f"{current_date}_{method_suffix}"
    )

    metadata_df = load_metadata_df(
        metadata_file, well_header, unique_id_header, method_suffix
    )

    # one sequence per plate, each also exported to its own csv
    per_plate_frames = []
    for plate_id, plate_location in plates_in_autosampler_location:
        plate_df = filter_metadata_by_plate_id(metadata_df, plate_id, plate_id_header)
        sequence_file = f"data/Sequence/{current_date}_seq_rack_{plate_location}_{lib_id}_{plate_id}_{method_suffix}"
        per_plate_frames.append(
            _create_orbitrap_sequence(
                plate_df,
                sequence_file,
                data_filepath,
                well_header,
                plate_location,
                instrument_methods,
                inject_volume_mul,
            )
        )

    concat = pd.concat(per_plate_frames)
    plates_str = "_".join(
        "{}in{}".format(plate_id, loc)
        for plate_id, loc in plates_in_autosampler_location
    )
    sequence_file = f"data/Sequence/{current_date}_{plates_str}_seq_combined.csv"

    # interleave blanks and qcs into the combined sequence, then export it
    final_df = add_blank_qc_rows(
        concat,
        data_filepath,
        instrument_methods,
        blank_well,
        qc_well,
        blank_qc_autosampler_location,
        blank_every_n_samples,
    )
    write_thermo_sequence(sequence_file, final_df)
def add_blank_qc_rows(
    df: pd.DataFrame,
    data_filepath,
    instrument_methods,
    blank_well,
    qc_well,
    blank_qc_autosampler_location,
    blank_every_n_samples,
) -> pd.DataFrame:
    """
    Interleave blank and QC injection rows into a sample sequence.

    The blank/QC rows are inserted before every chunk of blank_every_n_samples
    samples and once more at the very end. Control rows always use the first
    instrument method.

    :param df: sample sequence rows
    :param data_filepath: base data path; the method identifier is appended
    :param instrument_methods: list of InstrumentMethod; only the first is used
    :param blank_well: well of the blank sample, or None to skip blank rows
    :param qc_well: well of the QC sample, or None to skip QC rows
    :param blank_qc_autosampler_location: autosampler rack of the blank/QC plate
    :param blank_every_n_samples: chunk size between control injections
    :return: new DataFrame with control rows interleaved, or df unchanged when
        neither blank_well nor qc_well is given
    """
    if blank_well is None and qc_well is None:
        return df
    # TODO handle blank and qc autosampler location is None and use current plate
    main_method = instrument_methods[0]

    def control_row(file_name, well):
        # one injection row for a blank or QC sample
        return {
            "File Name": file_name,
            "Path": f"{data_filepath}_{main_method.identifier}",
            "Instrument Method": main_method.path,
            "Position": "{}:{}".format(blank_qc_autosampler_location, well),
            "Inj Vol": "1",
            "Dil Factor": 1,
        }

    # only emit rows for wells that were actually provided; previously, setting
    # only one of blank_well/qc_well produced a bogus row with Position "...:None"
    rows = []
    if blank_well is not None:
        rows.append(control_row("Blank", blank_well))
    if qc_well is not None:
        rows.append(control_row("QC", qc_well))
    blank_qc_df = pd.DataFrame(rows)

    chunks = pu.divide_chunks(df, blank_every_n_samples)
    chunks = [pd.concat([blank_qc_df, chunk]) for chunk in chunks]
    chunks.append(blank_qc_df)
    return pd.concat(chunks)
def _create_orbitrap_sequence(
    metadata_df: DataFrame,
    sequence_file,
    data_filepath,
    well_header,
    plate_location,
    instrument_methods: list[InstrumentMethod],
    inject_volume_mul=3,
) -> DataFrame:
    """
    Build and export one Xcalibur sequence per instrument method for one plate.

    :param metadata_df: a dataframe that is already filtered to only contain
        samples from a single plate
    :param sequence_file: the base sequence file to export. polarity and file
        type csv will be added automatically
    :param data_filepath: data acquisition file path. polarity will be added,
        path needs to be available before acquisition (date_suffix_polarity)
    :param well_header: column holding the well on the final plate, e.g., A1
    :param plate_location: position of the plate in the autosampler
    :param instrument_methods: instrument methods
    :param inject_volume_mul: injection volume in micro liter
    :return: concatenation of the per-method sequence frames
    :raises ValueError: when no instrument methods are given
    """
    if len(instrument_methods) == 0:
        raise ValueError("Provide at least one method file")

    frames = []
    for method in instrument_methods:
        polarity = method.identifier
        instrument_method = method.path
        # a method without a path is treated as disabled
        if not instrument_method:
            continue

        sequence = DataFrame(
            {
                "File Name": [
                    f"{base_filename}_{polarity}"
                    for base_filename in metadata_df[base_filename_header]
                ],
                "Path": f"{data_filepath}_{polarity}",
                "Instrument Method": instrument_method,
                "Position": [
                    "{}:{}".format(plate_location, well)
                    for well in metadata_df[well_header]
                ],
                "Inj Vol": inject_volume_mul,
                "Dil Factor": 1,
            }
        ).drop_duplicates()

        csv_file = f"{sequence_file}_{polarity}.csv"
        write_thermo_sequence(csv_file, sequence)
        frames.append(sequence)
    return pd.concat(frames)
def load_metadata_df(
    metadata_file, well_header, unique_id_header, method_suffix
) -> DataFrame:
    """
    Read the sample metadata table and derive each row's base file name.

    The base file name (date_uniqueid_suffix) is stored in the column named by
    the module-level base_filename_header constant.

    :param metadata_file: tsv or csv file; the separator is chosen by extension
    :param well_header: required column with well locations, e.g., A1
    :param unique_id_header: required column with unique sample ids
    :param method_suffix: appended to every base file name
    :return: metadata DataFrame with the base file name column added
    :raises ValueError: when a required column is missing
    """
    logging.info("Will run on %s", metadata_file)
    # tsv files are tab separated, anything else is read as comma separated
    separator = "\t" if metadata_file.endswith(".tsv") else ","
    df = pd.read_csv(metadata_file, sep=separator)

    if well_header not in df.columns:
        raise ValueError(
            f"No column named {well_header} with the well number, e.g., A1"
        )
    if unique_id_header not in df.columns:
        raise ValueError(
            f"No column named {unique_id_header} with unique sample ids. Run metadata clean up that generates a unique id, e.g., lib_plate1_A1_id (note the _id at the end and the prefix that make sure that wells like A1 do not match to A10)"
        )

    current_date = date.today().strftime("%Y%m%d")
    df[base_filename_header] = [
        "{}_{}_{}".format(current_date, unique_id, method_suffix)
        for unique_id in df[unique_id_header]
    ]
    return df
def filter_metadata_by_plate_id(
    metadata_df: DataFrame, plate_id: str, plate_id_header: str
) -> DataFrame:
    """
    Filter metadata_df to only contain entries for the current plate_id
    DONT apply filter if plate_id is empty - instead use full metadata_df
    :param metadata_df: full metadata table
    :param plate_id: plate identifier; may be a string or a number. Falsy values
        (empty string, None) disable filtering
    :param plate_id_header: name of the plate id column
    :return: filtered DataFrame, or metadata_df unchanged when plate_id is falsy
    :raises ValueError: if plate_id is given but the column is missing
    """
    if not plate_id:
        return metadata_df
    if plate_id_header not in metadata_df.columns:
        raise ValueError(
            f"Plate id filter was {plate_id} but there was no column in the metadata file for the plates. Name: {plate_id_header}"
        )
    # compare as strings on BOTH sides: the old code compared the str-cast column
    # to the raw plate_id, so numeric plate ids (e.g. 15) never matched even
    # though the docs promise "plate ids can be any string or number"
    return metadata_df[metadata_df[plate_id_header].astype(str) == str(plate_id)]
def write_thermo_sequence(csv_file, df: DataFrame):
    """
    Write a sequence DataFrame as an Xcalibur-compatible csv.

    Xcalibur requires the line "Bracket Type=4," before the column header, so
    the file is written in one pass with that line prepended (the old code
    wrote the csv, read it back completely, and rewrote it).

    :param csv_file: target csv path; missing parent directories are created
    :param df: sequence rows to export
    """
    # make sure the target folder (e.g. data/Sequence) exists before writing
    parent = os.path.dirname(csv_file)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # newline="" so pandas keeps control of csv line endings on all platforms
    with open(csv_file, "w", newline="") as out:
        out.write("Bracket Type=4,\n")
        df.to_csv(out, index=False)
    logging.info(f"Saved new sequence to: {csv_file}")
if __name__ == "__main__":
    try:
        main()
    except Exception:
        # narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # still terminate the script instead of being logged and remapped
        logging.exception("Could not create sequence")
        raise SystemExit(1)
    raise SystemExit(0)