-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathmakeMetadata.py
More file actions
188 lines (172 loc) · 9.36 KB
/
makeMetadata.py
File metadata and controls
188 lines (172 loc) · 9.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
from datetime import datetime
import pandas as pd
import numpy as np
import os.path
import glob
import warnings
# Column-name mappings: canonical field name (key) -> column header in the raw file (value).
# Environmental (GPS + TSG) data columns expected by read_env.
ENV_COLS = {'DateTime': 'UTC date time', 'Latitude': 'latitude', 'Longitude': 'longitude',
            'Salinity': 'salinity', 'Temperature': 'temperature_intake'}
# Manually-entered IFCB log-book columns expected by read_log.
LOG_COLS = {'bin': 'IFCB_bin_id', 'Depth': 'depth', 'Type': 'source', 'Source': 'source_id',
            'Reference': 'reference', 'Epoch': 'stn', 'Cast': 'cast', 'EpochDay': 'epoch_day', 'Flag': 'flag'}
# Default values applied to every bin by make_metadata (overridden by log entries).
# NOTE(review): 'Campaign': 2 looks like a placeholder — the __main__ examples
# always override it with a campaign name string; confirm intended default.
META_DEFAULTS = {'Type': 'inline', 'Depth': 5, 'Campaign': 2, 'Concentration': 1}
def read_env(filenames, keys=ENV_COLS, read_csv_kwargs=None):
    """
    Read environmental data, typically consisting of GPS and TSG measurements.

    Concatenates data from multiple files into a single pandas DataFrame,
    renaming columns to the canonical names given by `keys` and dropping
    every other column.

    :param filenames: str or list of str, path(s) to the CSV file(s)
    :param keys: dict mapping canonical field names to the column headers
        found in the files; must contain 'DateTime', 'Latitude', 'Longitude'
    :param read_csv_kwargs: optional dict of keyword arguments forwarded to pd.read_csv
    :return: pandas.DataFrame restricted to the canonical columns of `keys`
    :raises ValueError: if a required key is missing or no file could be read
    """
    if isinstance(filenames, str):  # accept a single path as well as a list
        filenames = [filenames]
    if read_csv_kwargs is None:
        read_csv_kwargs = {}
    for k in ['DateTime', 'Latitude', 'Longitude']:
        if k not in keys:
            raise ValueError(f'Environmental data missing key: {k}')
    ikeys = {v: k for k, v in keys.items()}  # file column header -> canonical name
    env = list()
    for f in filenames:
        try:
            df = pd.read_csv(f, **read_csv_kwargs)
        except UnicodeDecodeError:
            # Best effort: report and skip unreadable files, keep processing the rest.
            print(f'Codec Error reading {f}')
            continue
        df.rename(columns=ikeys, inplace=True)
        df.drop(columns=[c for c in df.columns if c not in keys.keys()], inplace=True)
        env.append(df)
    if not env:
        # pd.concat([]) raises a cryptic "No objects to concatenate"; fail clearly.
        raise ValueError('No environmental data could be read from the given filenames')
    return pd.concat(env)
def read_log(filename, sheet_name='Sheet1', keys=LOG_COLS):
    """
    Read the IFCB log book entered manually to differentiate specific samples
    (e.g. CTD, Experiments).

    Required fields: bin

    :param filename: path to the Excel log book
    :param sheet_name: sheet to read from the workbook
    :param keys: dict mapping canonical field names to the column headers in the sheet
    :return: pandas.DataFrame indexed by bin id, restricted to the columns of `keys`
    """
    if 'bin' not in keys:
        raise ValueError('IFCB Log missing key: bin')
    column_map = {source: canonical for canonical, source in keys.items()}
    log = (pd.read_excel(filename, sheet_name=sheet_name)
             .rename(columns=column_map)
             .set_index('bin'))
    keep = set(keys.keys())
    log = log.drop(columns=[c for c in log.columns if c not in keep])
    # Drop rows that carry no information at all (fully empty log lines).
    return log.dropna(how='all')
def read_events(filenames, index='name'):
    """
    Prepare long events such as Station Epochs for make_metadata.

    :param filenames: dict whose keys name a column of the metadata and whose
        values are paths to event files; each file must contain the columns
        name, start, and end
    :param index: column used as the index of each resulting DataFrame
    :return: dict of {key: pandas.DataFrame} with parsed start/end datetimes
    """
    return {
        key: pd.read_csv(path, parse_dates=['start', 'end']).set_index(index)
        for key, path in filenames.items()
    }
def make_metadata(path_to_raw, env, log, events=None, defaults=META_DEFAULTS):
    """
    Make metadata containing environmental (GPS + TSG) and sample
    identification (CTD, Experiments) for every IFCB bin.

    Bins flagged 'delete' in the log have their raw files (.roi/.adc/.hdr)
    moved into <path_to_raw>/ignored and are excluded from the result.

    :param path_to_raw: directory holding the raw IFCB files (*.roi, *.adc, *.hdr)
    :param env: DataFrame from read_env; must hold a datetime64 'DateTime' column
    :param log: DataFrame from read_log, indexed by bin id
    :param events: dict of {metadata column: events DataFrame} from read_events
    :param defaults: default values applied to every bin (overridden by log entries)
    :return: pandas.DataFrame indexed by bin id
    """
    if events is None:  # avoid a shared mutable default argument
        events = {}
    path_to_ignored = os.path.join(path_to_raw, 'ignored')
    # List all samples: bin ids are the basenames of the .roi files.
    bins = [os.path.splitext(os.path.basename(f))[0]
            for f in sorted(glob.glob(os.path.join(path_to_raw, '*.roi')))]
    # Move samples flagged with delete out of the raw directory and drop them.
    if 'Flag' in log.columns:
        for b in log.index[log.Flag == 'delete']:
            if b in bins:
                os.makedirs(path_to_ignored, exist_ok=True)
                for ext in ('.roi', '.adc', '.hdr'):
                    src = os.path.join(path_to_raw, f'{b}{ext}')
                    if os.path.exists(src):
                        os.rename(src, os.path.join(path_to_ignored, f'{b}{ext}'))
                bins.remove(b)
        log = log[log.Flag != 'delete']
    # Interpolate Env parameters to each bin's acquisition time.
    seen = set()
    # Ordered de-duplication of column names (bin first, then env, then log).
    keys = [x for x in ['bin', *list(env.keys()), *list(log.keys())]
            if not (x in seen or seen.add(x))]
    meta = {c: [] for c in keys}
    meta['bin'] = bins
    ts = []
    for b in bins:
        # Bin ids look like D<YYYYmmdd>T<HHMMSS><suffix>; the trailing 8
        # characters (e.g. '_IFCB107') are stripped before parsing the time.
        meta['DateTime'].append(datetime.strptime(b[:-8], 'D%Y%m%dT%H%M%S'))
        ts.append(meta['DateTime'][-1].timestamp())
    for k in env.keys():
        if k == 'DateTime':
            continue
        # env.DateTime is datetime64[ns]; convert to epoch seconds for np.interp.
        meta[k] = np.interp(ts, env.DateTime.to_numpy(dtype=np.int64) / 10**9,
                            env[k], left=np.nan, right=np.nan)
    # Set Default Parameters (if field absent from log or env it's added).
    for k, v in defaults.items():
        meta[k] = [v] * len(bins)
    # Set Remaining Fields to nan.
    for k in [k for k in log.keys() if k not in ['bin', *list(env.keys()), *list(defaults.keys())]]:
        meta[k] = [np.nan] * len(bins)
    meta = pd.DataFrame(meta).set_index('bin')
    # Set Events: label each bin whose DateTime falls in [start, end).
    for event_key, event_list in events.items():
        for name, e in event_list.iterrows():
            sel = (e.start <= meta.DateTime) & (meta.DateTime < e.end)
            meta.loc[sel, event_key] = name
    # Append log data to selected samples.
    with warnings.catch_warnings():  # This task fragments the DataFrame in memory but is still really fast to run
        warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)
        for b, r in log.iterrows():
            i = meta.index[b == meta.index]
            if i.empty:
                print(f'Raw bin missing or invalid log bin: {b}')
                continue
            i = i[0]
            # For samples deeper than the default (non-inline) depth the
            # interpolated environmental values are likely incorrect:
            # blank everything except dt, lat, and lon.
            if ('Depth' in r.keys() and not r.isnull()['Depth']) and \
                    ('Depth' in defaults.keys() and r['Depth'] > defaults['Depth']):
                for kk in env.keys():
                    if kk not in ['DateTime', 'Latitude', 'Longitude']:
                        meta.loc[i, kk] = np.nan
            for k, missing in r.isnull().items():
                if not missing:
                    meta.loc[i, k] = r[k]
    meta = meta.copy()  # Defragment: per-cell insertions above leave the frame highly fragmented
    return meta
if __name__ == '__main__':
    # Each "# %%" cell below is the configuration for one campaign; only the
    # Tara Europa cell is active, the others are kept as usage examples.
    # %% EXPORTS NA
    # root = '/Users/nils/Data/EXPORTS2/'
    # env = read_env(sorted(glob.glob(os.path.join(root, 'TSG', '*.csv'))))
    # log = read_log(os.path.join(root, 'IFCB107', 'IFCB_log_EXPORTS02.xlsx'))
    # events = read_events({'Epoch': os.path.join(root, 'IFCB107', 'EXPORTS2.epochs.csv')})
    # meta = make_metadata(os.path.join(root, 'IFCB107', 'raw'), env, log, events)
    # meta.to_csv(os.path.join(root, 'IFCB107', 'EXPORTS2.metadata.csv'))
    # %% Tara MicroBiome
    # root = '/Users/nils/Data/Tara/Microbiome/'
    # env_cols = {'DateTime': 'dt', 'Latitude': 'lat', 'Longitude': 'lon',
    #             'Salinity': 'sss', 'Temperature': 'sst'}
    # env = read_env(os.path.join(root, 'TaraChile&TaraMicrobiome_InLine_TSG_prod.csv'), keys=env_cols)
    # log_cols = {'bin': 'bin', 'Flag': 'flag', 'Type': 'source'}
    # log = read_log(os.path.join(root, 'IFCB107', 'IFCB107.TaraMicrobiome.log.xlsx'), keys=log_cols)
    # events = read_events({'Leg': os.path.join(root, 'TaraMicrobiome.legs.csv')})
    # meta_defaults = {'Type': 'inline', 'Depth': 1.5, 'Campaign': 'Tara Microbiome', 'Concentration': 1}
    # meta = make_metadata(os.path.join(root, 'IFCB107', 'raw'), env, log, events, defaults=meta_defaults)
    # meta.to_csv(os.path.join(root, 'IFCB107', 'IFCB107.TaraMicrobiome.metadata.csv'))
    # %% APERO
    # root = '/Users/nils/Data/APERO/'
    # env_cols = {'DateTime': 'DateTime', 'Latitude': 'Latitude', 'Longitude': 'Longitude',
    #             'Salinity': 'SBE21 Salinite (PSU)', 'Temperature': 'SBE3S Temp eau peak av (deg C).1'}
    # read_kwargs = dict(sep='\t', decimal=',', parse_dates={'DateTime': ['Date', 'Heure']}, dayfirst=True, encoding="ISO-8859-1")
    # env = read_env(sorted(glob.glob(os.path.join(root, 'CSV', '*.csv'))), env_cols, read_kwargs)
    # log_cols = {k: k for k in ['bin', 'DateTime', 'Latitude', 'Longitude', 'Station', 'Cast', 'Depth', 'Niskin', 'Type']}
    # log = read_log(os.path.join(root, 'IFCB179', 'IFCB179.APERO.logsheets.xlsx'), keys=log_cols)
    # meta_defaults = {'Type': 'inline', 'Depth': 5, 'Campaign': 'APERO', 'Concentration': 1, 'Flag': 0}
    # meta = make_metadata(os.path.join(root, 'IFCB179', 'raw'), env, log, defaults=meta_defaults)
    # meta.to_csv(os.path.join(root, 'IFCB179', 'IFCB179.APERO.metadata.csv'))
    # %% Tara Europa
    # NOTE(review): paths are hard-coded to a local machine — adjust `root` to run.
    root = '/Users/nils/Data/Tara/Europa'
    env_cols = {'DateTime': 'dt', 'Latitude': 'lat', 'Longitude': 'lon',
                'Salinity': 'sss', 'Temperature': 'sst'}
    read_kwargs = dict(parse_dates=['dt'])  # , date_format='%d-%b-%Y %H:%M:%S')
    env = read_env(os.path.join(root, 'TaraEuropa_InLine_TSG_20230403_20231109_Product_v20240131.csv'), env_cols, read_kwargs)
    # log_cols = {'bin': 'bin', 'Flag': 'flag', 'Type': 'source'}
    # log = read_log(os.path.join(root, 'IFCB107', 'IFCB107.TaraEuropa.log.xlsx'), keys=log_cols)
    # Empty log: no manual annotations available yet for this campaign.
    log = pd.DataFrame({'bin': [], 'Flag': []})
    # events = read_events({'Leg': os.path.join(root, 'TaraEuropa.legs.csv')})
    events = {}
    meta_defaults = {'Type': 'inline', 'Depth': 1.5, 'Campaign': 'Tara Europa', 'Concentration': 1}
    meta = make_metadata(os.path.join(root, 'IFCB107', 'raw'), env, log, events, defaults=meta_defaults)
    meta.to_csv(os.path.join(root, 'IFCB107', 'IFCB107.TaraEuropa.metadata.csv'))