-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathmakeMetadata.py
More file actions
188 lines (172 loc) · 9.36 KB
/
makeMetadata.py
File metadata and controls
188 lines (172 loc) · 9.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
from datetime import datetime
import pandas as pd
import numpy as np
import os.path
import glob
import warnings
# Column-name mappings: canonical field name (key) -> column header in the raw file (value).
# Environmental (GPS + TSG) data columns expected by read_env.
ENV_COLS = {'DateTime': 'UTC date time', 'Latitude': 'latitude', 'Longitude': 'longitude',
            'Salinity': 'salinity', 'Temperature': 'temperature_intake'}
# Manually-entered IFCB log-book columns expected by read_log.
LOG_COLS = {'bin': 'IFCB_bin_id', 'Depth': 'depth', 'Type': 'source', 'Source': 'source_id',
            'Reference': 'reference', 'Epoch': 'stn', 'Cast': 'cast', 'EpochDay': 'epoch_day', 'Flag': 'flag'}
# Default values applied to every bin by make_metadata (overridden by log entries).
# NOTE(review): 'Campaign': 2 looks like a placeholder — the __main__ examples
# always override it with a campaign name string; confirm intended default.
META_DEFAULTS = {'Type': 'inline', 'Depth': 5, 'Campaign': 2, 'Concentration': 1}
def read_env(filenames, keys=ENV_COLS, read_csv_kwargs=None):
    """
    Read environmental data, typically consisting of GPS and TSG measurements.

    Concatenates data from multiple files into a single pandas DataFrame,
    renaming columns to the canonical names given by `keys` and dropping
    every other column.

    :param filenames: str or list of str, path(s) to the CSV file(s)
    :param keys: dict mapping canonical field names to the column headers
        found in the files; must contain 'DateTime', 'Latitude', 'Longitude'
    :param read_csv_kwargs: optional dict of keyword arguments forwarded to pd.read_csv
    :return: pandas.DataFrame restricted to the canonical columns of `keys`
    :raises ValueError: if a required key is missing or no file could be read
    """
    if isinstance(filenames, str):  # accept a single path as well as a list
        filenames = [filenames]
    if read_csv_kwargs is None:
        read_csv_kwargs = {}
    for k in ['DateTime', 'Latitude', 'Longitude']:
        if k not in keys:
            raise ValueError(f'Environmental data missing key: {k}')
    ikeys = {v: k for k, v in keys.items()}  # file column header -> canonical name
    env = list()
    for f in filenames:
        try:
            df = pd.read_csv(f, **read_csv_kwargs)
        except UnicodeDecodeError:
            # Best effort: report and skip unreadable files, keep processing the rest.
            print(f'Codec Error reading {f}')
            continue
        df.rename(columns=ikeys, inplace=True)
        df.drop(columns=[c for c in df.columns if c not in keys.keys()], inplace=True)
        env.append(df)
    if not env:
        # pd.concat([]) raises a cryptic "No objects to concatenate"; fail clearly.
        raise ValueError('No environmental data could be read from the given filenames')
    return pd.concat(env)
def read_log(filename, sheet_name='Sheet1', keys=LOG_COLS):
    """
    Read the IFCB log book entered manually to differentiate specific samples
    (e.g. CTD, Experiments).

    Required fields: bin

    :param filename: path to the Excel log book
    :param sheet_name: sheet to read from the workbook
    :param keys: dict mapping canonical field names to the column headers in the sheet
    :return: pandas.DataFrame indexed by bin id, restricted to the columns of `keys`
    """
    if 'bin' not in keys:
        raise ValueError('IFCB Log missing key: bin')
    column_map = {source: canonical for canonical, source in keys.items()}
    log = (pd.read_excel(filename, sheet_name=sheet_name)
             .rename(columns=column_map)
             .set_index('bin'))
    keep = set(keys.keys())
    log = log.drop(columns=[c for c in log.columns if c not in keep])
    # Drop rows that carry no information at all (fully empty log lines).
    return log.dropna(how='all')
def read_events(filenames, index='name'):
    """
    Prepare long events such as Station Epochs for make_metadata.

    :param filenames: dict whose keys name a column of the metadata and whose
        values are paths to event files; each file must contain the columns
        name, start, and end
    :param index: column used as the index of each resulting DataFrame
    :return: dict of {key: pandas.DataFrame} with parsed start/end datetimes
    """
    return {
        key: pd.read_csv(path, parse_dates=['start', 'end']).set_index(index)
        for key, path in filenames.items()
    }
def make_metadata(path_to_raw, env, log, events=None, defaults=META_DEFAULTS):
    """
    Make metadata containing environmental (GPS + TSG) and sample
    identification (CTD, Experiments) for every IFCB bin.

    Bins flagged 'delete' in the log have their raw files (.roi/.adc/.hdr)
    moved into <path_to_raw>/ignored and are excluded from the result.

    :param path_to_raw: directory holding the raw IFCB files (*.roi, *.adc, *.hdr)
    :param env: DataFrame from read_env; must hold a datetime64 'DateTime' column
    :param log: DataFrame from read_log, indexed by bin id
    :param events: dict of {metadata column: events DataFrame} from read_events
    :param defaults: default values applied to every bin (overridden by log entries)
    :return: pandas.DataFrame indexed by bin id
    """
    if events is None:  # avoid a shared mutable default argument
        events = {}
    path_to_ignored = os.path.join(path_to_raw, 'ignored')
    # List all samples: bin ids are the basenames of the .roi files.
    bins = [os.path.splitext(os.path.basename(f))[0]
            for f in sorted(glob.glob(os.path.join(path_to_raw, '*.roi')))]
    # Move samples flagged with delete out of the raw directory and drop them.
    if 'Flag' in log.columns:
        for b in log.index[log.Flag == 'delete']:
            if b in bins:
                os.makedirs(path_to_ignored, exist_ok=True)
                for ext in ('.roi', '.adc', '.hdr'):
                    src = os.path.join(path_to_raw, f'{b}{ext}')
                    if os.path.exists(src):
                        os.rename(src, os.path.join(path_to_ignored, f'{b}{ext}'))
                bins.remove(b)
        log = log[log.Flag != 'delete']
    # Interpolate Env parameters to each bin's acquisition time.
    seen = set()
    # Ordered de-duplication of column names (bin first, then env, then log).
    keys = [x for x in ['bin', *list(env.keys()), *list(log.keys())]
            if not (x in seen or seen.add(x))]
    meta = {c: [] for c in keys}
    meta['bin'] = bins
    ts = []
    for b in bins:
        # Bin ids look like D<YYYYmmdd>T<HHMMSS><suffix>; the trailing 8
        # characters (e.g. '_IFCB107') are stripped before parsing the time.
        meta['DateTime'].append(datetime.strptime(b[:-8], 'D%Y%m%dT%H%M%S'))
        ts.append(meta['DateTime'][-1].timestamp())
    for k in env.keys():
        if k == 'DateTime':
            continue
        # env.DateTime is datetime64[ns]; convert to epoch seconds for np.interp.
        meta[k] = np.interp(ts, env.DateTime.to_numpy(dtype=np.int64) / 10**9,
                            env[k], left=np.nan, right=np.nan)
    # Set Default Parameters (if field absent from log or env it's added).
    for k, v in defaults.items():
        meta[k] = [v] * len(bins)
    # Set Remaining Fields to nan.
    for k in [k for k in log.keys() if k not in ['bin', *list(env.keys()), *list(defaults.keys())]]:
        meta[k] = [np.nan] * len(bins)
    meta = pd.DataFrame(meta).set_index('bin')
    # Set Events: label each bin whose DateTime falls in [start, end).
    for event_key, event_list in events.items():
        for name, e in event_list.iterrows():
            sel = (e.start <= meta.DateTime) & (meta.DateTime < e.end)
            meta.loc[sel, event_key] = name
    # Append log data to selected samples.
    with warnings.catch_warnings():  # This task fragments the DataFrame in memory but is still really fast to run
        warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)
        for b, r in log.iterrows():
            i = meta.index[b == meta.index]
            if i.empty:
                print(f'Raw bin missing or invalid log bin: {b}')
                continue
            i = i[0]
            # For samples deeper than the default (non-inline) depth the
            # interpolated environmental values are likely incorrect:
            # blank everything except dt, lat, and lon.
            if ('Depth' in r.keys() and not r.isnull()['Depth']) and \
                    ('Depth' in defaults.keys() and r['Depth'] > defaults['Depth']):
                for kk in env.keys():
                    if kk not in ['DateTime', 'Latitude', 'Longitude']:
                        meta.loc[i, kk] = np.nan
            for k, missing in r.isnull().items():
                if not missing:
                    meta.loc[i, k] = r[k]
    meta = meta.copy()  # Defragment: per-cell insertions above leave the frame highly fragmented
    return meta
if __name__ == '__main__':
    # Each "# %%" cell below is the configuration for one campaign; only the
    # Tara Europa cell is active, the others are kept as usage examples.
    # %% EXPORTS NA
    # root = '/Users/nils/Data/EXPORTS2/'
    # env = read_env(sorted(glob.glob(os.path.join(root, 'TSG', '*.csv'))))
    # log = read_log(os.path.join(root, 'IFCB107', 'IFCB_log_EXPORTS02.xlsx'))
    # events = read_events({'Epoch': os.path.join(root, 'IFCB107', 'EXPORTS2.epochs.csv')})
    # meta = make_metadata(os.path.join(root, 'IFCB107', 'raw'), env, log, events)
    # meta.to_csv(os.path.join(root, 'IFCB107', 'EXPORTS2.metadata.csv'))
    # %% Tara MicroBiome
    # root = '/Users/nils/Data/Tara/Microbiome/'
    # env_cols = {'DateTime': 'dt', 'Latitude': 'lat', 'Longitude': 'lon',
    #             'Salinity': 'sss', 'Temperature': 'sst'}
    # env = read_env(os.path.join(root, 'TaraChile&TaraMicrobiome_InLine_TSG_prod.csv'), keys=env_cols)
    # log_cols = {'bin': 'bin', 'Flag': 'flag', 'Type': 'source'}
    # log = read_log(os.path.join(root, 'IFCB107', 'IFCB107.TaraMicrobiome.log.xlsx'), keys=log_cols)
    # events = read_events({'Leg': os.path.join(root, 'TaraMicrobiome.legs.csv')})
    # meta_defaults = {'Type': 'inline', 'Depth': 1.5, 'Campaign': 'Tara Microbiome', 'Concentration': 1}
    # meta = make_metadata(os.path.join(root, 'IFCB107', 'raw'), env, log, events, defaults=meta_defaults)
    # meta.to_csv(os.path.join(root, 'IFCB107', 'IFCB107.TaraMicrobiome.metadata.csv'))
    # %% APERO
    # root = '/Users/nils/Data/APERO/'
    # env_cols = {'DateTime': 'DateTime', 'Latitude': 'Latitude', 'Longitude': 'Longitude',
    #             'Salinity': 'SBE21 Salinite (PSU)', 'Temperature': 'SBE3S Temp eau peak av (deg C).1'}
    # read_kwargs = dict(sep='\t', decimal=',', parse_dates={'DateTime': ['Date', 'Heure']}, dayfirst=True, encoding="ISO-8859-1")
    # env = read_env(sorted(glob.glob(os.path.join(root, 'CSV', '*.csv'))), env_cols, read_kwargs)
    # log_cols = {k: k for k in ['bin', 'DateTime', 'Latitude', 'Longitude', 'Station', 'Cast', 'Depth', 'Niskin', 'Type']}
    # log = read_log(os.path.join(root, 'IFCB179', 'IFCB179.APERO.logsheets.xlsx'), keys=log_cols)
    # meta_defaults = {'Type': 'inline', 'Depth': 5, 'Campaign': 'APERO', 'Concentration': 1, 'Flag': 0}
    # meta = make_metadata(os.path.join(root, 'IFCB179', 'raw'), env, log, defaults=meta_defaults)
    # meta.to_csv(os.path.join(root, 'IFCB179', 'IFCB179.APERO.metadata.csv'))
    # %% Tara Europa
    # NOTE(review): paths are hard-coded to a local machine — adjust `root` to run.
    root = '/Users/nils/Data/Tara/Europa'
    env_cols = {'DateTime': 'dt', 'Latitude': 'lat', 'Longitude': 'lon',
                'Salinity': 'sss', 'Temperature': 'sst'}
    read_kwargs = dict(parse_dates=['dt'])  # , date_format='%d-%b-%Y %H:%M:%S')
    env = read_env(os.path.join(root, 'TaraEuropa_InLine_TSG_20230403_20231109_Product_v20240131.csv'), env_cols, read_kwargs)
    # log_cols = {'bin': 'bin', 'Flag': 'flag', 'Type': 'source'}
    # log = read_log(os.path.join(root, 'IFCB107', 'IFCB107.TaraEuropa.log.xlsx'), keys=log_cols)
    # Empty log: no manual annotations available yet for this campaign.
    log = pd.DataFrame({'bin': [], 'Flag': []})
    # events = read_events({'Leg': os.path.join(root, 'TaraEuropa.legs.csv')})
    events = {}
    meta_defaults = {'Type': 'inline', 'Depth': 1.5, 'Campaign': 'Tara Europa', 'Concentration': 1}
    meta = make_metadata(os.path.join(root, 'IFCB107', 'raw'), env, log, events, defaults=meta_defaults)
    meta.to_csv(os.path.join(root, 'IFCB107', 'IFCB107.TaraEuropa.metadata.csv'))