-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfileprocessing.py
More file actions
52 lines (37 loc) · 1.51 KB
/
fileprocessing.py
File metadata and controls
52 lines (37 loc) · 1.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import glob
import pandas as pd
import re
import numpy as np
import cooler
import bioframe
def get_files(filedir, ext='mcool'):
    """Recursively collect paths under *filedir* whose names end with *ext*.

    Parameters
    ----------
    filedir : str or path-like
        Root directory to search; nested sub-directories are searched too.
    ext : str, optional
        Filename suffix to match. It is appended directly to a ``*``
        wildcard, so no dot is inserted: the default ``'mcool'`` matches any
        name ending in ``mcool``.

    Returns
    -------
    list of str
        Matching paths in sorted order; empty if nothing matches.
    """
    # ``**`` combined with ``recursive=True`` matches zero or more directory
    # levels, so files directly inside ``filedir`` are included as well.
    pattern = str(filedir) + '/**/*' + ext
    # glob returns matches in arbitrary order; sort so callers get a
    # reproducible list from one run to the next.
    return sorted(glob.glob(pattern, recursive=True))
def cools_df(cools, resolution=100000):
    """Build a table of cooler files with metadata parsed from their paths.

    Each path is split on runs of ``_``, ``.``, ``/`` and ``-``. The 8th
    token from the end becomes ``cell_line`` and the 5th token from the end
    becomes ``assembly``.

    Parameters
    ----------
    cools : iterable of str
        Paths of the cooler files to tabulate.
    resolution : int, optional
        Resolution to open inside each file; the URI passed to
        ``cooler.Cooler`` is ``<path>::/resolutions/<resolution>``.

    Returns
    -------
    pandas.DataFrame
        One row per input path with columns ``cell_line``, ``assembly``,
        ``path`` and ``cooler``. ``cooler`` holds the opened
        ``cooler.Cooler`` object, or NaN when opening it failed.

    Raises
    ------
    IndexError
        If a path splits into fewer than eight tokens.
    """
    rows = []
    for path in cools:
        # Raw string: the separators are literal characters inside a class.
        tokens = re.split(r"[_./\-]+", path)
        rows.append([tokens[-8], tokens[-5], path])
    df = pd.DataFrame(rows, columns=['cell_line', 'assembly', 'path'])

    coolers = []
    for path in df['path']:
        try:
            coolers.append(
                cooler.Cooler(path + '::/resolutions/' + str(resolution)))
        except Exception:
            # Best effort: a file that cannot be opened at this resolution
            # keeps its row and is marked with NaN. KeyboardInterrupt and
            # SystemExit are deliberately not caught.
            coolers.append(float("NaN"))

    # Explicit index and object dtype keep the column aligned with the rows
    # and well-defined even when ``cools`` is empty.
    df['cooler'] = pd.Series(coolers, index=df.index, dtype=object)
    return df
def beds_df(beds, prefix_depth=5):
    """Build a metadata table from BED file paths laid out as directories.

    Each path is split on ``/``. The first ``prefix_depth`` components are
    discarded and the remaining ones are assigned, in order, to the metadata
    columns listed below. Hyphens are removed from ``cell_line``.

    Parameters
    ----------
    beds : iterable of str
        File paths whose trailing components encode the metadata.
    prefix_depth : int, optional
        Number of leading ``/``-separated components to drop before the
        metadata starts. For an absolute path the first component is the
        empty string before the leading ``/``. Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        Columns ``cell_line``, ``assay``, ``file_format``, ``output_type``,
        ``assembly``, ``file_status``, ``target``, ``biosample_treatment``,
        ``lab``, ``replicate``, ``file_name`` and ``file_location`` (the
        full original path).

    Notes
    -----
    Every path must leave exactly eleven components after the prefix is
    dropped; otherwise the row width does not match the column list and
    pandas rejects the data.
    """
    columns = [
        'cell_line', 'assay', 'file_format', 'output_type',
        'assembly', 'file_status', 'target', 'biosample_treatment',
        'lab', 'replicate', 'file_name', 'file_location',
    ]
    rows = [bed.split('/')[prefix_depth:] + [bed] for bed in beds]
    df = pd.DataFrame(rows, columns=columns)
    # Literal (non-regex) removal of hyphens from the cell line name.
    df['cell_line'] = df['cell_line'].str.replace('-', '', regex=False)
    return df
def get_genecov(df):
    """Compute gene coverage of the genomic bins once per assembly.

    For every distinct value in ``df.assembly`` the first usable entry of the
    ``cooler`` column is taken, its bin table is read with ``bins()[:]`` and
    passed to ``bioframe.tools.frac_gene_coverage`` together with the
    assembly name.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``assembly`` column and a ``cooler`` column. Missing
        (NaN) ``cooler`` entries are ignored.

    Returns
    -------
    dict
        Maps each assembly to the ``'gene_coverage'`` item of the
        ``frac_gene_coverage`` result. Assemblies without any non-null
        ``cooler`` entry are omitted and a warning is issued for each.
    """
    import warnings

    genecov_dict = {}
    for assembly in df.assembly.unique():
        # The first row of an assembly may hold NaN instead of a cooler
        # object, so pick the first non-null entry rather than row 0.
        candidates = df.loc[df.assembly == assembly, 'cooler'].dropna()
        if candidates.empty:
            warnings.warn(
                "no usable cooler for assembly %r; skipping" % (assembly,))
            continue
        bins = candidates.iloc[0].bins()[:]
        genecov = bioframe.tools.frac_gene_coverage(bins, assembly)
        genecov_dict[assembly] = genecov['gene_coverage']
    return genecov_dict