Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
260 changes: 258 additions & 2 deletions README.md
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Очень основательное ридми, круто! Отдельный лайк за секцию Contacts and acknowledgements.

Large diffs are not rendered by default.

112 changes: 112 additions & 0 deletions beginner_bioinf_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
def run_dna_rna_tools(inputs: tuple) -> list or str:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Тут небольшие проблемки с аннотацией: list or str так работать не будет :). Выражение `list or str` вычисляется просто в `list`; тут надо брать Union[list, str] (или list | str в Python 3.10+).

"""
Produce a list of either transcripts, reverse sequences, complementary sequences or reverse complementary sequences
arguments:
- inputs (tuple): an arbitrary number of strings where the last one is the name of the desired operation, and the other strings are sequences
return
- complement_seqs (list): a list of complementary sequences
"""
from modules.dna_rna_tools import check_input, transcribe, reverse, complement, reverse_complement
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Импорты лучше помещать вне функций в начало модуля

if len(inputs) < 2:
raise ValueError('Invalid input: the function requires at least one sequence and an operation name!')
*seqs, operation = inputs
check_input(seqs, operation)
if operation == 'transcribe':
result = transcribe(seqs)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
result = transcribe(seqs)
results = transcribe(seqs)

Так будет понятно что это коллекция:)

elif operation == 'reverse':
result = reverse(seqs)
elif operation == 'complement':
result = complement(seqs)
elif operation == 'reverse_complement':
result = reverse_complement(seqs)
else:
raise ValueError('Invalid input: unknown operation! Check the last argument.')
if len(result) == 1:
result = ''.join(result)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
result = ''.join(result)
result = result[0]

return result


def run_protein_tools(inputs: tuple, options: str = None) -> list or dict:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def run_protein_tools(inputs: tuple, options: str = None) -> list or dict:
def run_protein_tools(inputs: tuple, operation: str = None) -> list or dict:

"""
Produce a list or dictionary according to the option specified
arguments:
- inputs (tuple): a tuple of inputs
- options (str): option name
"""
from modules.protein_tools import calculate_similarity, count_length, info_amino_acid_percentage, find_pattern, convert_to_gene, recode_3letter_to_1letter, check_protein, string_check, check_input
check_input(inputs, options)
operations = {
'similarity': calculate_similarity,
'length': count_length,
'percentage': info_amino_acid_percentage,
'pattern': find_pattern,
'3letter_name': recode_3letter_to_1letter,
'dna_code': convert_to_gene
}

if options == 'similarity':
result = operations[options](inputs[:-2], inputs[-2], inputs[-1])
return (result)
elif options == 'pattern':
result = operations[options](inputs[1:len(inputs)], inputs[0])
return (result)
elif options == '3letter_name':
result = operations[options](inputs[:-1], inputs[-1])
return (result)
elif options == 'length' or options == 'percentage' or options == 'dna_code':
result = []
for inpt in inputs:
res = operations[options](inpt)
result.append(res)
return (result)
else:
raise ValueError('Incorrect options input, please try again')


def run_fastq_filtration(seqs: dict, gc_bounds: tuple = (0, 100), length_bounds: tuple = (0, 2**32), quality_threshold: int = 0):
"""
Filter a fastq dictionary according to the parameters specified
arguments:
- seqs (dict): fastq dictionary
- gc_bounds (tuple): range of GC content
- length_bounds (tuple): range of sequence length
- quality_threshold (int): lower limit of average in-sequence quality
return:
- seqs_filtered (dict): filtered fastq dictionary
"""
from modules.fastq_filtration_tools import check_fastq, check_gc, check_length, check_quality
check_fastq(seqs)
seqs_filtered = {}
if type(gc_bounds) == int:
gc_bounds = tuple([0, gc_bounds])
Comment on lines +80 to +81
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if type(gc_bounds) == int:
gc_bounds = tuple([0, gc_bounds])
if isinstance(gc_bounds, int) or isinstance(gc_bounds, float):
gc_bounds = (0, gc_bounds)

if type(length_bounds) == int:
length_bounds = tuple([0, length_bounds])
Comment on lines +82 to +83
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if type(length_bounds) == int:
length_bounds = tuple([0, length_bounds])
if isinstance(length_bounds, int):
length_bounds = (0, length_bounds)

for seq_name in seqs:
gc_check = False
length_check = False
quality_check = False
gc_check = check_gc(seqs[seq_name][0], gc_bounds)
length_check = check_length(seqs[seq_name][0], length_bounds)
quality_check = check_quality(seqs[seq_name][1], quality_threshold)
if gc_check and length_check and quality_check:
seqs_filtered[seq_name] = seqs[seq_name]
return seqs_filtered
Comment on lines +84 to +93
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Тут в цикле можно было бы итерироваться сразу по всем элементам, чтобы их использовать.
Suggested change
for seq_name in seqs:
gc_check = False
length_check = False
quality_check = False
gc_check = check_gc(seqs[seq_name][0], gc_bounds)
length_check = check_length(seqs[seq_name][0], length_bounds)
quality_check = check_quality(seqs[seq_name][1], quality_threshold)
if gc_check and length_check and quality_check:
seqs_filtered[seq_name] = seqs[seq_name]
return seqs_filtered
for name, (seq, qual) in seqs.items():
gc_check = False
length_check = False
quality_check = False
gc_check = check_gc(seq, gc_bounds)
length_check = check_length(seq, length_bounds)
quality_check = check_quality(qual, quality_threshold)
if gc_check and length_check and quality_check:
seqs_filtered[name] = (seq, qual)
return seqs_filtered
  1. В целом не нужно пре-создавать 3 переменные заданные в False. Просто записываем в них результаты проверок



def run_beginner_bioinf_tools(*input_data: str or dict, toolbox: str = None, **kwargs: str) -> str or list or dict:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ну в целом тут супер-мастер функция уже не особо нужна, тут получается с аргументами совсем переусложнённая история. Но то, что ты попробовал и сделал её, — это безусловно плюс для опыта

"""
Performs various operations on nucleic acid, protein and fastq sequences
arguments:
- input_data (str or dict): data to be processed
- toolbox (str): determines which of the three toolkits is used
return:
- (str or list or dict): processed data
"""
if toolbox == 'dna_rna':
return run_dna_rna_tools(input_data)
elif toolbox == 'proteins':
return run_protein_tools(input_data, kwargs['options'])
elif toolbox == 'fastq':
return run_fastq_filtration(input_data[0], gc_bounds = kwargs['gc_bounds'], length_bounds = kwargs['length_bounds'], quality_threshold = kwargs['quality_threshold'])
else:
raise ValueError('Invalid input: there is no toolbox corresponding to value ' + str(toolbox) + '!')
3 changes: 3 additions & 0 deletions modules/dna_rna_constants.py
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Очень круто что ты вынес константы в отдельные файлы!! Их бы еще капсом назвать, так чисто принято чтобы отличать константы от переменных

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Every character accepted in a nucleic-acid sequence: DNA and RNA letters
# together, each in upper and lower case.
dna_rna_alphabet = list('AaTtCcGgUu')
# Pairing partner of each DNA letter; the case of the input letter is kept.
compl_dna = dict(zip('AaTtCcGg', 'TtAaGgCc'))
# Pairing partner of each RNA letter; the case of the input letter is kept.
compl_rna = dict(zip('AaUuCcGg', 'UuAaGgCc'))
92 changes: 92 additions & 0 deletions modules/dna_rna_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
def check_input(seqs: list, operation: str):
"""
Check input for some of the most common errors
arguments:
- seqs (list): a list of sequences
- operation (str): operation name
return
- no return
"""
from modules.dna_rna_constants import dna_rna_alphabet
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Импорты все таки лучше делать в начале скрипта вне функций. А то так каждый раз при вызове функции будет повторяться импорт

for seq in seqs:
if type(seq) != str:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Чаще принято все таки делать проверку через if isinstance(seq, str), хотя так тоже работает

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ты же кстати вроде в ДЗ 3 и использовал её уже

raise ValueError('Invalid input: all sequences must be of type str!')
t_present = False
u_present = False
for i in seq:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Тут все таки лучше не i, это же не счетчик какой-то или индекс, это нуклеотид.

Suggested change
for i in seq:
for nucl in seq:

if i not in dna_rna_alphabet:
raise ValueError('Invalid input: sequences must contain only letters "A", "T", "C", "G" in either upper or lower case!')
if i == 'T' or i == 't':
t_present = True
if i == 'U' or i == 'u':
u_present = True
if t_present and u_present:
Comment on lines +19 to +23
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

В целом валидное решение с двумя флагами. Я лично чуть меньше люблю флаги, потому что за ними надо следить (а код иногда бывает очень большой). Но тут кажется это ок.

raise ValueError('Invalid input: sequences must not include both thymine and uracil!')
if operation == 'transcribe' and u_present:
raise ValueError('Invalid input: cannot transcribe RNA sequence(s).')
Comment on lines +25 to +26
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔥



def transcribe(seqs: list) -> list:
"""
Produce a list of transcripts
arguments:
- seqs (list): a list of sequences
return
- transcripts (list): a list of transcripts
"""
from modules.dna_rna_constants import compl_dna
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Импорты вне функции
  2. В питоне константы еще принято называть капсом, COMPL_DNA

transcripts = []
for seq in seqs:
transcript = []
for i in seq:
transcript.append(compl_dna[i])
transcripts.append(''.join(transcript))
Comment on lines +39 to +43
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for seq in seqs:
transcript = []
for i in seq:
transcript.append(compl_dna[i])
transcripts.append(''.join(transcript))
for seq in seqs:
transcript = ''.join(COMPL_DNA[nucl] for nucl in seq)
transcripts.append(transcript)

return transcripts


def reverse(seqs: list) -> list:
    """
    Produce a list of reversed sequences
    arguments:
    - seqs (list): a list of sequences
    return
    - (list): the sequences in their original order, each one read backwards
    """
    # A negative-step slice builds a reversed copy of each sequence;
    # the input list itself is left untouched.
    return [seq[::-1] for seq in seqs]


def complement(seqs: list) -> list:
"""
Produce a list of complementary sequences
arguments:
- seqs (list): a list of sequences
return
- complement_seqs (list): a list of complementary sequences
"""
from modules.dna_rna_constants import compl_dna, compl_rna
complement_seqs = []
for seq in seqs:
if 'U' in seq or 'u' in seq:
complement_seq = []
for j in seq:
complement_seq.append(compl_rna[j])
else:
complement_seq = []
for j in seq:
complement_seq.append(compl_dna[j])
Comment on lines +72 to +79
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Кажется что от этого условия if-else ничего не поменялось:)

complement_seqs.append(''.join(complement_seq))
return complement_seqs


def reverse_complement(seqs: list) -> list:
    """
    Build the reverse complementary sequence for every input sequence
    arguments:
    - seqs (list): a list of sequences
    return
    - (list): a list of reverse complementary sequences
    """
    # Reverse every sequence first, then complement the reversed sequences.
    reversed_seqs = reverse(seqs)
    return complement(reversed_seqs)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return complement(reverse(seqs))
return reverse(complement(seqs))

😁

1 change: 1 addition & 0 deletions modules/fastq_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Letters allowed in a fastq read sequence (upper case only).
fastq_dna_code = list('ATGC')
87 changes: 87 additions & 0 deletions modules/fastq_filtration_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
def check_fastq(seqs: dict):
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Очень хорошая глубокая система проверок:)

"""
Check fastq dictionary
arguments:
- seqs (dict): a fastq dictionary
return:
- no return
"""
from modules.fastq_constants import fastq_dna_code
if type(seqs) != dict:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Через isinstance

raise ValueError('Invalid input: a dict object was expected!')
for seq_name in seqs:
if type(seqs[seq_name][0]) != str:
raise ValueError('Invalid input: sequences must be of type str!')
if len(seqs[seq_name][0]) == 0:
raise ValueError('Invalid input: sequences must be at least one nucleotide long!')
for i in seqs[seq_name][0]:
if i not in fastq_dna_code:
Comment on lines +17 to +18
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Тут лучше было бы сделать проверку на множествах:)

raise ValueError('Invalid input: sequences must contain only letters "A", "T", "G", "C" in upper case!')
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ну в целом lower case тоже по идее может быть:)

if seq_name[0] != '@':
raise ValueError('Invalid input: sequence names are incorrect!')
Comment on lines +20 to +21
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Не знаю насколько это строгое условие для FASTQ, но ок:)



def check_fastq(seqs: dict):
"""
Check fastq dictionary
arguments:
- seqs (dict): a fastq dictionary
return:
- no return
"""
from modules.fastq_constants import fastq_dna_code
if type(seqs) != dict:
raise ValueError('Invalid input: a dict object was expected!')
for seq_name in seqs:
if type(seqs[seq_name][0]) != str:
raise ValueError('Invalid input: sequences must be of type str!')
if len(seqs[seq_name][0]) == 0:
raise ValueError('Invalid input: sequences must be at least one nucleotide long!')
for i in seqs[seq_name][0]:
if i not in fastq_dna_code:
raise ValueError('Invalid input: sequences must contain only letters "A", "T", "G", "C" in upper case!')
if seq_name[0] != '@':
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Мб кстати тут лучше было бы не [0] а seq_name.startswith('@'). По сути одно и то же, но читается имхо сильно понятнее

raise ValueError('Invalid input: sequence names are incorrect!')


def check_gc(seq: str, gc_bounds: 'tuple | int | float') -> bool:
    """
    Check whether the GC content of a sequence lies within the range provided
    arguments:
    - seq (str): a sequence
    - gc_bounds (tuple or number): inclusive (lower, upper) limits of GC content
      in percent; a single number is treated as the upper limit with a lower limit of 0
    return:
    - (bool): whether GC content of the sequence is in the range provided
    """
    # A bare number means "at most this many percent".
    if isinstance(gc_bounds, (int, float)):
        gc_bounds = (0, gc_bounds)
    # Only upper-case 'G' and 'C' are counted.
    gc_sum = sum(1 for nucleotide in seq if nucleotide in ('G', 'C'))
    seq_length = len(seq)
    # An empty sequence contains no G or C, so its GC content is taken as 0 %
    # instead of dividing by zero.
    gc_percent = gc_sum / seq_length * 100 if seq_length else 0.0
    return gc_bounds[0] <= gc_percent <= gc_bounds[1]
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хех, компактность огонь:)
Мб деление на длину и умножение на 100 стоило бы даже на отдельную строчку вынести, все таки отдельный шаг логики



def check_length(seq: str, length_bounds: 'tuple | int') -> bool:
    """
    Check whether the length of a sequence lies within the range provided
    arguments:
    - seq (str): a sequence
    - length_bounds (tuple or int): inclusive (lower, upper) limits of length;
      a single int is treated as the upper limit with a lower limit of 0
    return:
    - (bool): whether length of the sequence is in the range provided
    """
    # A bare int means "at most this long".
    if isinstance(length_bounds, int):
        length_bounds = (0, length_bounds)
    lower, upper = length_bounds[0], length_bounds[1]
    return lower <= len(seq) <= upper
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔥



def check_quality(quality: str, quality_threshold: int) -> bool:
    """
    Check whether the average quality of a sequence reaches the lower limit provided
    arguments:
    - quality (str): a string describing the quality of each nucleotide in a sequence
    - quality_threshold (int): the lower limit for average quality
    return:
    - (bool): whether average quality is equal to or higher than the lower limit provided
    """
    # Each character's score is its character code minus 33.
    quality_sum = sum(ord(symbol) - 33 for symbol in quality)
    symbol_count = len(quality)
    # With no quality symbols there is nothing to average; use 0 instead of
    # dividing by zero.
    mean_quality = quality_sum / symbol_count if symbol_count else 0.0
    return mean_quality >= quality_threshold
34 changes: 34 additions & 0 deletions modules/protein_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
retranslation_dict = {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Обычно к названию переменной не очень нужно добавлять ее тип. Мы же не пишем seq_str или

for  i_int in range(0, 10):
    ...

'F': 'TTC', 'f': 'ttc',
'L': 'TTA', 'l': 'tta',
'S': 'TCG', 's': 'tcg',
'Y': 'TAC', 'y': 'tac',
'C': 'TGC', 'c': 'tgc',
'W': 'TGG', 'w': 'tgg',
'P': 'CCC', 'p': 'ccc',
'H': 'CAT', 'h': 'cat',
'Q': 'CAA', 'q': 'caa',
'R': 'CGA', 'r': 'cga',
'I': 'ATT', 'i': 'att',
'M': 'ATG', 'm': 'atg',
'T': 'ACC', 't': 'acc',
'N': 'AAT', 'n': 'aat',
'K': 'AAA', 'k': 'aaa',
'V': 'GTT', 'v': 'gtt',
'A': 'GCA', 'a': 'gca',
'D': 'GAT', 'd': 'gat',
'E': 'GAG', 'e': 'gag',
'G': 'GGG', 'g': 'ggg'
}

threel = {'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'D': "ASP", 'V': 'VAL',
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Не совсем понимаю что подразумевается под threel

'H': 'HIS', 'G': "GLY", 'Q': "GLN", 'E': 'GLU', 'I': 'ILE',
'L': 'LEU', 'K': 'LYS', 'M': 'MET', 'P': 'PRO', 'S': 'SER',
'Y': 'TYR', 'T': 'THR', 'W': 'TRP', 'F': 'PHE', 'C': 'CYS',
'a': 'ala', 'r': 'arg', 'n': 'asn', 'd': "asp", 'v': 'val',
'h': 'his', 'g': "gly", 'q': "gln", 'e': 'glu', 'i': 'ile',
'l': 'leu', 'k': 'lys', 'm': 'met', 'p': 'pro', 's': 'ser',
'y': 'tyr', 't': 'thr', 'w': 'trp', 'f': 'phe', 'c': 'cys'
}

# One-letter amino-acid codes; every letter appears in upper case and is
# immediately followed by its lower-case form.
aminoacids = [
    form
    for letter in 'FLSYCWPHQRIMTNKVADEG'
    for form in (letter, letter.lower())
]
Loading