forked from yoann-dufresne/YARC
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreads_generator.py
More file actions
executable file
·233 lines (182 loc) · 8.64 KB
/
reads_generator.py
File metadata and controls
executable file
·233 lines (182 loc) · 8.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#!/usr/bin/env python3
import sys
import argparse
import csv
import random
from math import ceil
from Bio import SeqIO
from Bio.Seq import Seq
from metropolis_hasting import MetropolisHasting1D
def parse_arguments():
# Parse the command line
parser = argparse.ArgumentParser(description="Create reads from a fasta file reagarding a length distribution and a coverage.")
parser.add_argument('--length_distribution', '-l', required=True, help="CSV file containing a read length distribution. The columns X and Y must be present where X is the read length and Y the number of reads with this length.")
parser.add_argument('--depth', '-d', type=int, required=True, help="Sequencing depth. Without subsampling, this sequencing depth is perfect (ie each nucleotide of the ref is present exactly d time in the reads).")
parser.add_argument('--sub-sample', '-s', type=float, default=1.0, help="Sub-sample ratio. Depending of this ratio, a bigger sequencing depth will be computed. From this bigger pool of reads, the software randomly sub sample to generate an average depth d.")
parser.add_argument('--outprefix', '-o', help="The file prefix where the reads will be saved. If not specified, outputed on stdout (and stderr for paired end)")
parser.add_argument('--reference', '-r', help="The fasta file containing the reference used to generate reads")
parser.add_argument("--paired", "-p", action='store_true', help="If set, generate a pair of read files instead of one. See the gap argument for details on gap size.")
parser.add_argument("--gap_distribution", '-g', help="The gap distribution file used for paired end reads. If not set, the gap will be 0.")
parser.add_argument('--circular', '-c', action='store_true', help="Assume that all the input sequences are circular chromosomes or plasmids.")
parser.add_argument('--seed', type=int, help="Seed value to initialize the random generator. Used for reproducibility.")
args = parser.parse_args()
if args.sub_sample <= 0:
print("sub sample ratio must be positive and non-null", file=sys.stderr)
exit(1)
elif args.sub_sample > 1:
print("sub sample ratio can't be over 1", file=sys.stderr)
exit(1)
return args
def parse_csv(filename):
""" Parse the CSV file for a distribution and return a list of tuples (x, y).
Args:
filename: The CSV filename
Return:
A list of tuples (x, y) regarding the columns X and Y in the csv file.
"""
tuples = []
with open(filename, newline='') as csv_file:
reader = csv.DictReader(csv_file, delimiter=',')
for row in reader:
t = (float(row["X"]), float(row["Y"]))
if t[1] != 0:
tuples.append(t)
return tuples
def init_random_generators(len_distrib, seed=None):
""" Init random generator regarding the distribution of read lengths
Args:
len_distrib: A CSV filename containing the distributions for the length.
In this CSV file, the columns X and Y must be present.
Return:
A namespace containing three Metropolis hasting generators regarding the three different
distributions.
"""
if seed:
random.seed(seed)
# Generate the Metropolis hasting for insertions
distrib = parse_csv(len_distrib)
return MetropolisHasting1D(distrib, 10000, seed=random.randint(0, 1000000000))
def generate(filename, len_generator, depth, paired=False, circular=False, subsample_ratio=1, outprefix=None):
""" Read the reference fasta file and write the out_file with reads. The lengths are
randomized using the length generator.
Args:
filename: The input reference FASTA filename. The sequences will be splited into reads.
If None, stdin is used instead.
len_generator: A random generator for read length.
depth: the output sequencing depth (ie, the average coverage on each nucleotide from the
reference when the outputed reads are aligned)
circular: Boolean to assume if the input sequences are
out_file: Filename for the outputed fasta. All the modified reads will be outputed in this
file. The filename is optional. If the filename is not specified, then the outfile will
be stdout
Return: None
"""
# print("TODO: Fix problem with floating point depth", file=sys.stderr)
full_depth = depth / subsample_ratio
# Get the input name
reader = None
if not filename:
reader = sys.stdin
else:
reader = open(filename)
writer = None
writer2 = None
if not paired:
# Get the outfile name
if not outprefix:
writer = sys.stdout
else:
writer = open(outprefix + ".fasta", "w")
else:
if not outprefix:
writer = sys.stdout
writer2 = sys.stderr
else:
writer = open(outprefix + '_R1.fasta', 'w')
writer2 = open(outprefix + '_R2.fasta', 'w')
# split sequences
random_lengths = []
read_idx = 0
for record in SeqIO.parse(reader, "fasta"):
str_seq = str(record.seq)
# Depth
for d in range(ceil(full_depth)):
# Circular to linear part
split_idx = 0
if circular:
# select the split idx
split_idx = random.randint(0, len(str_seq))
# reorganize the genome
str_seq = "{}{}".format(str_seq[split_idx:], str_seq[:split_idx])
# current idx on reference for next split
left_idx = 0
right_idx = len(str_seq)
# Make reads from left and right
while left_idx < right_idx:
# Length computation
if len(random_lengths) == 0:
random_lengths = len_generator.next_values(10000)
random.shuffle(random_lengths)
length, _ = random_lengths.pop(0)
length = round(length)
total_length = length
if paired:
total_length = total_length * 2 #TODO : + gap
# From left or right ?
left = (random.random() <= 0.5)
origin = split_idx
if left:
sequence = str_seq[left_idx:min(right_idx+1, left_idx +total_length)]
origin += left_idx
left_idx += total_length
else:
sequence = str_seq[max(left_idx, right_idx-total_length+1):right_idx+1]
origin += max(left_idx, right_idx-total_length+1)
right_idx -= total_length
origin %= len(str_seq)
# Length problem
if len(sequence) < total_length:
continue
# sub-sampling
if random.random() > subsample_ratio:
continue
# Floating point coverage
if d == 0 and random.random() > (full_depth%1):
continue
# Reverse complement with 50% chance
reversed = False
if random.random() <= 0.5:
sequence = str(Seq(sequence).reverse_complement())
if paired:
# Extract the beginning of the paired sequence
seq_1 = sequence[:min(length, len(sequence))]
# Extract the end of the paired sequence
seq_2 = sequence[max(0, len(sequence)-length):]
seq_2 = str(Seq(seq_2).reverse_complement())
# Output the reads
writer.write(">{}_{}_R1;origin={};\n{}\n".format(
record.description, read_idx, origin, seq_1))
# Output the reads
writer2.write(">{}_{}_R2;origin={};\n{}\n".format(
record.description, read_idx, origin, seq_2))
else:
# Output the read
writer.write(">{}_{};origin={};\n{}\n".format(
record.description, read_idx, origin, sequence))
read_idx += 1
# Close the outfile if needed
if not outprefix:
writer.close()
if paired:
writer2.close()
if not filename:
reader.close()
def main():
args = parse_arguments()
rnd_gens = init_random_generators(
args.length_distribution
)
# filename, len_generator, depth, subsample_ratio=1, outprefix=None
generate(args.reference, rnd_gens, args.depth, circular=args.circular, paired=args.paired, subsample_ratio=args.sub_sample, outprefix=args.outprefix)
if __name__ == "__main__":
main()