zmitserbio · zmitserbio · Oct 11, 2023 · Oct 11, 2023 · Oct 11, 2023 · Oct 11, 2023
diff --git a/README.md b/README.md
diff --git a/beginner_bioinf_tools.py b/beginner_bioinf_tools.py
@@ -0,0 +1,112 @@
+def run_dna_rna_tools(inputs: tuple) -> list or str:
+    """
+    Apply one nucleic acid operation to every sequence in inputs
+    arguments:
+    - inputs (tuple): an arbitrary number of strings where the last one is the name of the desired operation
+      ('transcribe', 'reverse', 'complement' or 'reverse_complement') and all preceding strings are sequences
+    return
+    - result (list or str): processed sequences; a plain string when exactly one sequence was given
+    raises
+    - ValueError: if fewer than two items are given or the operation name is unknown
+      (check_input is also run on the sequences first)
+    """
+    if len(inputs) < 2:
+        raise ValueError('Invalid input: the function requires at least one sequence and an operation name!')
+    from modules.dna_rna_tools import check_input, transcribe, reverse, complement, reverse_complement
+    *seqs, operation = inputs
+    check_input(seqs, operation)
+    operations = {
+        'transcribe': transcribe,
+        'reverse': reverse,
+        'complement': complement,
+        'reverse_complement': reverse_complement,
+    }
+    # Guard with isinstance so an unhashable operation value still ends in ValueError.
+    handler = operations.get(operation) if isinstance(operation, str) else None
+    if handler is None:
+        raise ValueError('Invalid input: unknown operation! Check the last argument.')
+    result = handler(seqs)
+    # A single input sequence is returned as a bare string, not a one-element list.
+    if len(result) == 1:
+        result = result[0]
+    return result
+
+
+def run_protein_tools(inputs: tuple, options: str = None) -> list or dict:
+    """
+    Run one protein operation on inputs and return its result
+    arguments:
+    - inputs (tuple): sequences plus any extra positional data the chosen operation needs:
+      'similarity' passes the last two items as separate arguments,
+      'pattern' passes the first item separately from the rest,
+      '3letter_name' passes the last item separately from the rest
+    - options (str): operation name, one of 'similarity', 'length', 'percentage', 'pattern',
+      '3letter_name', 'dna_code'
+    return
+    - result: whatever the selected operation returns; for 'length', 'percentage' and 'dna_code'
+      a list with one entry per input
+    raises
+    - ValueError: if options is not one of the supported operation names
+    """
+    from modules.protein_tools import calculate_similarity, count_length, info_amino_acid_percentage, find_pattern, convert_to_gene, recode_3letter_to_1letter, check_protein, string_check, check_input
+    check_input(inputs, options)
+    # Operations applied independently to every input item.
+    per_sequence = {
+        'length': count_length,
+        'percentage': info_amino_acid_percentage,
+        'dna_code': convert_to_gene,
+    }
+
+    if options == 'similarity':
+        return calculate_similarity(inputs[:-2], inputs[-2], inputs[-1])
+    if options == 'pattern':
+        return find_pattern(inputs[1:], inputs[0])
+    if options == '3letter_name':
+        return recode_3letter_to_1letter(inputs[:-1], inputs[-1])
+    if isinstance(options, str) and options in per_sequence:
+        operation = per_sequence[options]
+        return [operation(inpt) for inpt in inputs]
+    raise ValueError('Incorrect options input, please try again')
+
+
+def run_fastq_filtration(seqs: dict, gc_bounds: tuple = (0, 100), length_bounds: tuple = (0, 2**32), quality_threshold: int = 0):
+    """
+    Filter a fastq dictionary according to the parameters specified
+    arguments:
+    - seqs (dict): fastq dictionary; each value holds the sequence at index 0 and its quality string at index 1
+    - gc_bounds (tuple or number): (lower, upper) bounds for GC content in percent; a bare number is used as the upper bound
+    - length_bounds (tuple or int): (lower, upper) bounds for sequence length; a bare int is used as the upper bound
+    - quality_threshold (int): lower limit of average in-sequence quality
+    return:
+    - seqs_filtered (dict): the entries of seqs that pass all three checks
+    """
+    from modules.fastq_filtration_tools import check_fastq, check_gc, check_length, check_quality
+    check_fastq(seqs)
+    # A single number is shorthand for "no lower bound, this upper bound".
+    if isinstance(gc_bounds, (int, float)):
+        gc_bounds = (0, gc_bounds)
+    if isinstance(length_bounds, int):
+        length_bounds = (0, length_bounds)
+    seqs_filtered = {}
+    for name, record in seqs.items():
+        seq, qual = record[0], record[1]
+        if (check_gc(seq, gc_bounds)
+                and check_length(seq, length_bounds)
+                and check_quality(qual, quality_threshold)):
+            # Keep the caller's original record object untouched.
+            seqs_filtered[name] = record
+    return seqs_filtered
+
+
+def run_beginner_bioinf_tools(*input_data: str or dict, toolbox: str = None, **kwargs: str) -> str or list or dict:
+    """
+    Performs various operations on nucleic acid, protein and fastq sequences
+    arguments:
+    - input_data (str or dict): data to be processed; for the 'fastq' toolbox only the first item is used
+    - toolbox (str): determines which of the three toolkits is used: 'dna_rna', 'proteins' or 'fastq'
+    - kwargs: 'options' for the 'proteins' toolbox; optional 'gc_bounds', 'length_bounds' and
+      'quality_threshold' for the 'fastq' toolbox (omitted ones fall back to run_fastq_filtration defaults)
+    return:
+    - (str or list or dict): processed data
+    raises:
+    - ValueError: if toolbox is not recognised, or the 'fastq' toolbox is called without input data
+    """
+    if toolbox == 'dna_rna':
+        return run_dna_rna_tools(input_data)
+    if toolbox == 'proteins':
+        return run_protein_tools(input_data, kwargs.get('options'))
+    if toolbox == 'fastq':
+        if not input_data:
+            raise ValueError('Invalid input: a fastq dictionary is required!')
+        # Forward only the filter settings the caller actually supplied.
+        filter_params = {
+            key: kwargs[key]
+            for key in ('gc_bounds', 'length_bounds', 'quality_threshold')
+            if key in kwargs
+        }
+        return run_fastq_filtration(input_data[0], **filter_params)
+    raise ValueError('Invalid input: there is no toolbox corresponding to value ' + str(toolbox) + '!')
diff --git a/modules/dna_rna_constants.py b/modules/dna_rna_constants.py
@@ -0,0 +1,3 @@
+# Every symbol accepted in a nucleic acid sequence (DNA and RNA bases, both cases).
+dna_rna_alphabet = [
+    'A', 'a',
+    'T', 't',
+    'C', 'c',
+    'G', 'g',
+    'U', 'u',
+]
+# Case-preserving complement of each DNA base.
+compl_dna = {
+    'A': 'T', 'a': 't',
+    'T': 'A', 't': 'a',
+    'C': 'G', 'c': 'g',
+    'G': 'C', 'g': 'c',
+}
+# Case-preserving complement of each RNA base.
+compl_rna = {
+    'A': 'U', 'a': 'u',
+    'U': 'A', 'u': 'a',
+    'C': 'G', 'c': 'g',
+    'G': 'C', 'g': 'c',
+}
diff --git a/modules/dna_rna_tools.py b/modules/dna_rna_tools.py
@@ -0,0 +1,92 @@
+def check_input(seqs: list, operation: str):
+    """
+    Check input for some of the most common errors
+    arguments:
+    - seqs (list): a list of sequences
+    - operation (str): operation name
+    return
+    - no return
+    raises
+    - ValueError: if a sequence is not a str, contains a symbol outside dna_rna_alphabet,
+      mixes thymine with uracil, or contains uracil while operation is 'transcribe'
+    """
+    from modules.dna_rna_constants import dna_rna_alphabet
+    for seq in seqs:
+        if not isinstance(seq, str):
+            raise ValueError('Invalid input: all sequences must be of type str!')
+        for nucl in seq:
+            if nucl not in dna_rna_alphabet:
+                raise ValueError('Invalid input: sequences must contain only letters "A", "T", "C", "G", "U" in either upper or lower case!')
+        t_present = 'T' in seq or 't' in seq
+        u_present = 'U' in seq or 'u' in seq
+        if t_present and u_present:
+            raise ValueError('Invalid input: sequences must not include both thymine and uracil!')
+        if operation == 'transcribe' and u_present:
+            raise ValueError('Invalid input: cannot transcribe RNA sequence(s).')
+
+
+def transcribe(seqs: list) -> list:
+    """
+    Produce a list of transcripts
+    arguments:
+    - seqs (list): a list of sequences; every symbol must be a key of compl_dna
+    return
+    - transcripts (list): one transcript per input sequence, in the same order
+    """
+    from modules.dna_rna_constants import compl_dna
+    # NOTE(review): each base is mapped through the DNA complement table, so the
+    # output still contains T rather than U - confirm this is the intended
+    # meaning of "transcribe".
+    transcripts = [''.join(compl_dna[nucl] for nucl in seq) for seq in seqs]
+    return transcripts
+
+
+def reverse(seqs: list) -> list:
+    """
+    Reverse every sequence in the list
+    arguments:
+    - seqs (list): a list of sequences
+    return
+    - (list): the sequences read back to front, in the original list order
+    """
+    return [seq[::-1] for seq in seqs]
+
+
+def complement(seqs: list) -> list:
+    """
+    Build the complementary sequence for every input sequence
+    arguments:
+    - seqs (list): a list of sequences
+    return
+    - (list): a list of complementary sequences, in the original list order
+    """
+    from modules.dna_rna_constants import compl_dna, compl_rna
+    result = []
+    for seq in seqs:
+        # Any uracil marks the sequence as RNA; otherwise the DNA table is used.
+        is_rna = 'U' in seq or 'u' in seq
+        table = compl_rna if is_rna else compl_dna
+        result.append(''.join([table[nucl] for nucl in seq]))
+    return result
+
+
+def reverse_complement(seqs: list) -> list:
+    """
+    Produce a list of reverse complementary sequences
+    arguments:
+    - seqs (list): a list of sequences
+    return
+    - (list): a list of reverse complementary sequences
+    """
+    # Complement first, then reverse each complemented sequence.
+    return reverse(complement(seqs))
diff --git a/modules/fastq_constants.py b/modules/fastq_constants.py
@@ -0,0 +1 @@
+# Symbols allowed in a fastq read sequence (upper-case DNA bases).
+fastq_dna_code = list('ATGC')
diff --git a/modules/fastq_filtration_tools.py b/modules/fastq_filtration_tools.py
@@ -0,0 +1,87 @@
+def check_fastq(seqs: dict):
+    """
+    Check fastq dictionary
+    arguments:
+    - seqs (dict): a fastq dictionary; keys are sequence names, each value holds the sequence at index 0
+    return:
+    - no return
+    raises:
+    - ValueError: if seqs is not a dict, a sequence is not a non-empty str made only of
+      symbols from fastq_dna_code, or a sequence name does not start with '@'
+    """
+    from modules.fastq_constants import fastq_dna_code
+    if not isinstance(seqs, dict):
+        raise ValueError('Invalid input: a dict object was expected!')
+    for seq_name, record in seqs.items():
+        seq = record[0]
+        if not isinstance(seq, str):
+            raise ValueError('Invalid input: sequences must be of type str!')
+        if not seq:
+            raise ValueError('Invalid input: sequences must be at least one nucleotide long!')
+        for nucl in seq:
+            if nucl not in fastq_dna_code:
+                raise ValueError('Invalid input: sequences must contain only letters "A", "T", "G", "C" in upper case!')
+        # startswith also covers an empty name; non-str names are rejected the same way.
+        if not (isinstance(seq_name, str) and seq_name.startswith('@')):
+            raise ValueError('Invalid input: sequence names are incorrect!')
+
+
+def check_gc(seq: str, gc_bounds: tuple or int) -> bool:
+    """
+    Check how GC content of a sequence corresponds to the range provided
+    arguments:
+    - seq (str): a non-empty sequence; only upper-case 'G' and 'C' are counted
+    - gc_bounds (tuple or int): (lower, upper) bounds for GC content in percent, both inclusive;
+      a bare number is used as the upper bound with 0 as the lower bound
+    return:
+    - (bool): whether GC content of a sequence is in the range provided
+    """
+    if isinstance(gc_bounds, (int, float)):
+        gc_bounds = (0, gc_bounds)
+    gc_sum = seq.count('G') + seq.count('C')
+    return gc_bounds[0] <= gc_sum/len(seq)*100 <= gc_bounds[1]
+
+
+def check_length(seq: str, length_bounds: tuple or int) -> bool:
+    """
+    Check how length of a sequence corresponds to the range provided
+    arguments:
+    - seq (str): a sequence
+    - length_bounds (tuple or int): (lower, upper) bounds for length, both inclusive;
+      a bare int is used as the upper bound with 0 as the lower bound
+    return:
+    - (bool): whether length of a sequence is in the range provided
+    """
+    if isinstance(length_bounds, int):
+        length_bounds = (0, length_bounds)
+    return length_bounds[0] <= len(seq) <= length_bounds[1]
+
+
+def check_quality(quality: str, quality_threshold: int):
+    """
+    Compare the mean per-symbol quality score with a lower limit
+    arguments:
+    - quality (str): non-empty quality string; each symbol's score is its code point minus 33
+    - quality_threshold (int): the lower limit for average quality
+    return:
+    - (bool): whether the average score is equal to or higher than the lower limit provided
+    """
+    total_score = sum(ord(symbol) - 33 for symbol in quality)
+    mean_score = total_score/len(quality)
+    return mean_score >= quality_threshold
diff --git a/modules/protein_constants.py b/modules/protein_constants.py
@@ -0,0 +1,34 @@
+# One representative DNA codon per amino acid (standard genetic code);
+# lower-case one-letter codes map to the same codon in lower case.
+retranslation_dict = {
+    'F': 'TTC', 'f': 'ttc',
+    'L': 'TTA', 'l': 'tta',
+    'S': 'TCG', 's': 'tcg',
+    'Y': 'TAC', 'y': 'tac',
+    'C': 'TGC', 'c': 'tgc',
+    'W': 'TGG', 'w': 'tgg',
+    'P': 'CCC', 'p': 'ccc',
+    'H': 'CAT', 'h': 'cat',
+    # Glutamine is CAA/CAG; GAA would encode glutamate.
+    'Q': 'CAA', 'q': 'caa',
+    'R': 'CGA', 'r': 'cga',
+    'I': 'ATT', 'i': 'att',
+    'M': 'ATG', 'm': 'atg',
+    'T': 'ACC', 't': 'acc',
+    'N': 'AAT', 'n': 'aat',
+    'K': 'AAA', 'k': 'aaa',
+    'V': 'GTT', 'v': 'gtt',
+    'A': 'GCA', 'a': 'gca',
+    # Aspartate: the lower-case entry must mirror the upper-case codon.
+    'D': 'GAT', 'd': 'gat',
+    'E': 'GAG', 'e': 'gag',
+    'G': 'GGG', 'g': 'ggg',
+}
+
+# One-letter -> three-letter amino acid codes, upper case first.
+threel = {
+    'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'D': 'ASP', 'V': 'VAL',
+    'H': 'HIS', 'G': 'GLY', 'Q': 'GLN', 'E': 'GLU', 'I': 'ILE',
+    'L': 'LEU', 'K': 'LYS', 'M': 'MET', 'P': 'PRO', 'S': 'SER',
+    'Y': 'TYR', 'T': 'THR', 'W': 'TRP', 'F': 'PHE', 'C': 'CYS',
+}
+# Lower-case one-letter codes map to lower-case three-letter codes, in the same order.
+threel.update({one.lower(): three.lower() for one, three in list(threel.items())})
+
+# Accepted one-letter amino acid codes: each upper-case letter followed by its lower-case form.
+aminoacids = [letter for upper in 'FLSYCWPHQRIMTNKVADEG' for letter in (upper, upper.lower())]