# Bio_Files_Processor.py
import os
from typing import List, Optional


def convert_multiline_fasta_to_oneline(input_fasta: str, output_fasta: Optional[str] = None) -> str:
    """
    Convert a multiline FASTA file into a FASTA file with one sequence line per record.

    Every record of the result consists of the name line (starting with '>') followed by a
    single line holding the whole sequence. The result is written into the folder
    "Converted_data" in the current working directory; the folder is created when missing
    and reused when it already exists.

    :param input_fasta: Path to the input FASTA file. The name must end with the
        ``.fasta`` extension (checked case-insensitively).
    :type input_fasta: str
    :param output_fasta: Name of the output file (with extension). When None, the name is
        ``one_line_`` followed by the input file name.
    :type output_fasta: Optional[str]
    :rtype: str
    :return: Script completion message
    :raises ValueError: if ``input_fasta`` does not end with ``.fasta``
    """
    # The original test `find('.fasta') == 0` only fired for paths *starting* with ".fasta".
    if not input_fasta.lower().endswith('.fasta'):
        raise ValueError('Wrong file format in input!')

    output_dir = os.path.join('.', 'Converted_data')
    os.makedirs(output_dir, exist_ok=True)

    if output_fasta is None:
        # basename() gives the file name; the original `strip("/")[-1]` gave only the last character.
        out_name = 'one_line_' + os.path.basename(input_fasta)
    else:
        out_name = output_fasta

    # A list (not a dict) keeps records that share a header instead of overwriting them.
    records = []
    with open(input_fasta) as seq_fasta:
        for line in seq_fasta:
            line = line.rstrip('\n')
            if line.startswith('>'):
                records.append((line, []))
            elif records:
                # Lines before the first header belong to no record and are ignored.
                records[-1][1].append(line)

    with open(os.path.join(output_dir, out_name), mode='w') as new_seq_fasta:
        for header, chunks in records:
            new_seq_fasta.write(header + '\n')
            new_seq_fasta.write(''.join(chunks) + '\n')
    return 'All sequences processed!'
# This span holds the rest of Bio_Files_Processor.py (select_genes_from_gbk_to_fasta)
# followed by the whole of Bio_Seq_Analysis_Tool.py.
import os
from abc import ABC, abstractmethod
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

# Biopython (Bio.SeqIO, Bio.Seq, Bio.SeqRecord, Bio.SeqUtils.gc_fraction) is imported inside
# the FASTQ functions that need it, so the sequence classes and the GenBank helper stay
# importable on machines without Biopython.


def _gene_qualifier(line: str) -> Optional[str]:
    """Return the raw value of a ``/gene=`` qualifier line, or None for any other line."""
    stripped = line.strip()
    # startswith() keeps look-alike qualifiers such as /gene_synonym= from being counted as genes.
    if stripped.startswith('/gene='):
        return stripped.split('=', 1)[1]
    return None


def _select_neighbours(gene_names: List[str], genes: List[str], n_before: int, n_after: int) -> Dict[str, str]:
    """Map the names of genes surrounding every gene of interest to an empty protein string."""
    total = len(gene_names)
    # Never step far enough to wrap around onto the gene of interest itself.
    before = min(n_before, total - 1)
    after = min(n_after, total - 1)
    neighbours: Dict[str, str] = {}
    for query in genes:
        for index, name in enumerate(gene_names):
            # Substring match: names read from the file still carry their double quotes.
            if query not in name:
                continue
            # Modular indexing treats the gene list as circular, which is what the original
            # negative indices and "last gene" branch did, without IndexError or off-by-one.
            for step in range(1, before + 1):
                neighbours.setdefault(gene_names[(index - step) % total], '')
            for step in range(1, after + 1):
                neighbours.setdefault(gene_names[(index + step) % total], '')
    return neighbours


def _read_translations(input_gbk: str, neighbours: Dict[str, str]) -> None:
    """Fill ``neighbours`` in place with the ``/translation`` of every gene named in it."""
    current = None  # neighbour gene whose translation is expected next
    chunks = None   # pieces of a translation that spans several lines
    with open(input_gbk) as gbk:
        for line in gbk:
            stripped = line.strip()
            if chunks is not None:
                chunks.append(stripped)
                if '"' in stripped:  # closing quote ends the qualifier
                    neighbours[current] = ''.join(chunks).replace('"', '')
                    current, chunks = None, None
                continue
            name = _gene_qualifier(line)
            if name is not None:
                # Reset on every gene so a gene without a translation cannot hand the
                # "waiting" state over to the next, unrelated gene.
                current = name if name in neighbours else None
            elif current is not None and stripped.startswith('/translation='):
                value = stripped.split('=', 1)[1]
                if value.count('"') >= 2:
                    # Whole translation sits on one line: opening and closing quote present.
                    neighbours[current] = value.replace('"', '')
                    current = None
                else:
                    chunks = [value]


def select_genes_from_gbk_to_fasta(input_gbk: str, genes: List[str], n_before: int = 1, n_after: int = 1,
                                   output_fasta: Optional[str] = None) -> str:
    """
    Write the neighbours of the genes of interest (GOI) from a GenBank file to a FASTA file.

    Each neighbour is written as a name line ('>' + gene name) and a line with its protein
    sequence. Neighbours without a ``/translation`` qualifier are skipped. The gene list is
    treated as circular, so neighbours of the first/last gene wrap around. Results are saved
    into the folder "Analyzed_data" in the current working directory (created when missing).

    :param input_gbk: Path to the GenBank file; the name must end with ``.gbk``
        (checked case-insensitively)
    :type input_gbk: str
    :param genes: Names of the genes of interest (matched as substrings of ``/gene`` values)
    :type genes: List[str]
    :param n_before: number of genes to take before each GOI, default value = 1
    :type n_before: int
    :param n_after: number of genes to take after each GOI, default value = 1
    :type n_after: int
    :param output_fasta: Name of the output FASTA file, default value = None
        (``output_for_gbk.fasta`` is used)
    :type output_fasta: Optional[str]
    :rtype: str
    :return: Script completion message
    :raises ValueError: if ``input_gbk`` does not end with ``.gbk``
    """
    # The original test `find('.gbk') == 0` only fired for paths *starting* with ".gbk".
    if not input_gbk.lower().endswith('.gbk'):
        raise ValueError('Wrong file format in input!')
    output_dir = os.path.join('.', 'Analyzed_data')
    os.makedirs(output_dir, exist_ok=True)
    if output_fasta is None:
        output_fasta = 'output_for_gbk.fasta'

    with open(input_gbk) as gbk:
        gene_names = [name for name in map(_gene_qualifier, gbk) if name is not None]
    neighbours = _select_neighbours(gene_names, genes, n_before, n_after)
    _read_translations(input_gbk, neighbours)

    with open(os.path.join(output_dir, output_fasta), mode='w') as fasta:
        for name, protein in neighbours.items():
            if not protein:
                continue  # no translation found for this gene
            fasta.write('>' + name.replace('"', '') + '\n')
            fasta.write(protein + '\n')
    return 'All sequences processed!'


# ---------------------------------------------------------------------------
# Bio_Seq_Analysis_Tool.py
# ---------------------------------------------------------------------------


def analyse_gc(records: Iterable[Any], min_gc: Union[int, float], max_gc: Union[int, float]) -> Dict[str, Any]:
    """
    Return the records whose GC-content lies within the given bounds.

    :param records: parsed FASTQ records (objects with ``id`` and ``seq``)
    :type records: Iterable
    :param min_gc: Lower boundary for GC-content, in percent
    :type min_gc: Union[int, float]
    :param max_gc: Upper boundary for GC-content, in percent
    :type max_gc: Union[int, float]
    :rtype: Dict[str, Any]
    :return: records that passed the filter, keyed by record id
    """
    from Bio.SeqUtils import gc_fraction

    # The bounds are given in percent; they are rescaled to fractions before comparing.
    lower, upper = min_gc / 100, max_gc / 100
    return {record.id: record for record in records if lower <= gc_fraction(record.seq) <= upper}


def filter_by_length(records: Iterable[Any], min_length: Union[int, float],
                     max_length: Union[int, float]) -> Dict[str, Any]:
    """
    Return the records whose sequence length lies within the given bounds.

    :param records: parsed FASTQ records (objects with ``id`` and ``seq``)
    :type records: Iterable
    :param min_length: Lower boundary for sequence length (inclusive)
    :type min_length: Union[int, float]
    :param max_length: Upper boundary for sequence length (inclusive)
    :type max_length: Union[int, float]
    :rtype: Dict[str, Any]
    :return: records that passed the filter, keyed by record id
    """
    return {record.id: record for record in records if min_length <= len(record.seq) <= max_length}


def filter_by_quality(records: Iterable[Any], quality_threshold: Union[int, float]) -> Dict[str, Any]:
    """
    Return the records in which every base has at least the given phred quality.

    :param records: parsed FASTQ records (objects with ``id`` and
        ``letter_annotations["phred_quality"]``)
    :type records: Iterable
    :param quality_threshold: minimal accepted phred quality of a single base
    :type quality_threshold: Union[int, float]
    :rtype: Dict[str, Any]
    :return: records that passed the filter, keyed by record id
    """
    # all() equals `min(...) >= threshold` for non-empty reads and, unlike min(),
    # does not raise on a read of length zero.
    return {record.id: record for record in records
            if all(score >= quality_threshold for score in record.letter_annotations["phred_quality"])}


def _write_fasta(path: str, sequences: Dict[str, str]) -> None:
    """Write a ``{name: sequence}`` mapping to ``path`` in FASTA format."""
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    records = [SeqRecord(Seq(sequence), id=name, description="") for name, sequence in sequences.items()]
    with open(path, "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")


def write_filtered_sequences_to_fastq(filtered_sequences: Dict[str, str], unfiltered_sequences: Dict[str, str],
                                      output_file: str, folder_path: str = 'fastq_filtrator_results') -> None:
    """
    Save the reads that passed and the reads that failed the filters into a result folder.

    Despite the function name, both files are written in FASTA format (names and sequences
    only). Reads that passed go to ``output_file``; the others go to
    ``unfiltered_sequences.fasta`` in the same folder.

    :param filtered_sequences: sequences that passed the filters, keyed by read name
    :type filtered_sequences: Dict[str, str]
    :param unfiltered_sequences: sequences that did not pass the filters, keyed by read name
    :type unfiltered_sequences: Dict[str, str]
    :param output_file: name of the file for the sequences that passed
    :type output_file: str
    :param folder_path: name of the result folder, default = 'fastq_filtrator_results'
    :type folder_path: str
    :rtype: None
    :return: None
    """
    os.makedirs(folder_path, exist_ok=True)
    _write_fasta(os.path.join(folder_path, output_file), filtered_sequences)
    _write_fasta(os.path.join(folder_path, 'unfiltered_sequences.fasta'), unfiltered_sequences)


def filter_fastq(input_path: str,
                 gc_bounds: Union[int, float, Tuple[int], Tuple[float]] = (0, 100),
                 length_bounds: Union[int, Tuple[int]] = (0, 2**32),
                 quality_threshold: float = 0.0, filtered_file_name: Union[None, str] = None) -> str:
    """
    Filter the reads of a FASTQ file by GC-content, length and base quality.

    Reads that pass all three filters are written to ``filtered_file_name`` and the remaining
    reads to ``unfiltered_sequences.fasta``, both in FASTA format inside the folder
    ``fastq_filtrator_results``.

    :param input_path: Path to the FASTQ file.
    :type input_path: str
    :param gc_bounds: Boundaries for GC-content in percent. A single number is treated as the
        upper boundary with 0 as the lower one. The lower boundary cannot be less than 0 and
        the upper boundary cannot be greater than 100. Default value is (0, 100).
    :type gc_bounds: Union[int, float, Tuple[int], Tuple[float]]
    :param length_bounds: Boundaries for read length; works the same as gc_bounds. The lower
        boundary cannot be less than 0 and the upper boundary cannot be greater than 2^32.
        Default value is (0, 2^32).
    :type length_bounds: Union[int, Tuple[int]]
    :param quality_threshold: Minimal phred quality every base of a read must have. Cannot be
        more than 40. Default value is 0.
    :type quality_threshold: float
    :param filtered_file_name: Name of the file for reads that passed; None means
        ``filtered_sequences.fasta``.
    :type filtered_file_name: Union[None, str]
    :rtype: str
    :return: Script completion message
    :raises ValueError: if the argument values are outside the allowed ones
    """
    from Bio import SeqIO

    if isinstance(gc_bounds, (int, float)):
        gc_bounds = (0, gc_bounds)
    min_gc, max_gc = gc_bounds
    if min_gc < 0 or max_gc > 100:
        raise ValueError('Wrong boundaries!')
    if isinstance(length_bounds, int):
        length_bounds = (0, length_bounds)
    min_length, max_length = length_bounds
    if min_length < 0 or max_length > 2**32:
        raise ValueError('Wrong boundaries!')
    if quality_threshold > 40:
        raise ValueError('Wrong quality threshold!')

    records = list(SeqIO.parse(input_path, "fastq"))

    # Each stage only sees the reads that survived the previous one.
    passed_gc = analyse_gc(records, min_gc, max_gc)
    passed_length = filter_by_length(passed_gc.values(), min_length, max_length)
    passed_quality = filter_by_quality(passed_length.values(), quality_threshold)

    filtered_seq = {record_id: str(record.seq) for record_id, record in passed_quality.items()}
    unfiltered_seq = {record.id: str(record.seq) for record in records if record.id not in filtered_seq}

    new_file_name = "filtered_sequences.fasta" if filtered_file_name is None else filtered_file_name
    write_filtered_sequences_to_fastq(filtered_seq, unfiltered_seq, new_file_name)

    return 'Sequences are filtered!'


class BiologicalSequence(ABC):
    """
    Abstract class for different biological sequences
    """

    @abstractmethod
    def __len__(self):
        """
        Support the built-in len(). !Needs to be overridden in child class!
        """

    @abstractmethod
    def __getitem__(self, item):
        """
        Get elements by index or slice. !Needs to be overridden in child class!
        """

    @abstractmethod
    def __str__(self):
        """
        Convert the sequence to a string. !Needs to be overridden in child class!
        """

    @abstractmethod
    def is_alphabet_correct(self):
        """
        Check that the sequence is written with the correct alphabet.
        """


class NucleicAcidSequnce(BiologicalSequence):
    """
    Class for DNA or RNA molecules

    Concrete molecules (DNASequence, RNASequence) choose their alphabet through ``_alphabet``
    and their base pairing through ``complement_dict``.
    """

    dna_alphabet = frozenset('AaTtGgCc')
    rna_alphabet = frozenset('AaUuGgCc')
    # Generic pairing kept for backward compatibility; subclasses override it.
    complement_dict = {'A': 'T', 'C': 'G',
                       'G': 'C', 'T': 'A', 'U': 'A', 'a': 't',
                       'c': 'g', 'g': 'c', 't': 'a', 'u': 'a'}
    # Alphabet of the concrete molecule; None means "neither DNA nor RNA".
    _alphabet = None

    def __init__(self, seq) -> None:
        self.seq = seq

    def __len__(self) -> int:
        return len(self.seq)

    def __getitem__(self, item) -> str:
        return self.seq[item]

    def __str__(self) -> str:
        return self.seq

    def is_alphabet_correct(self) -> bool:
        """
        Check that the sequence contains only the nucleotides of its molecule type.

        :rtype: bool
        :return: True when the sequence is valid
        :raises TypeError: if the sequence has other symbols or the object is neither
            a DNASequence nor an RNASequence
        """
        if self._alphabet is not None and set(self.seq).issubset(self._alphabet):
            return True
        raise TypeError(f'{self.seq} is not correct nucleic acid')

    def complement(self):
        """
        Return the complement sequence.

        :rtype: same class as the object
        :return: complement sequence (letter case is preserved)
        :raises TypeError: if the sequence is not a correct nucleic acid
        """
        self.is_alphabet_correct()
        pairing = self.complement_dict
        return type(self)(''.join(pairing[base] for base in self.seq))

    def gc_calculate(self) -> float:
        """
        Return the GC-content of the sequence in percent.

        :rtype: float
        :return: GC-content in percent, rounded to two digits; 0.0 for an empty sequence
        """
        length = len(self.seq)
        if length == 0:
            return 0.0  # avoids ZeroDivisionError on an empty sequence
        seq_up = self.seq.upper()
        return round((seq_up.count("C") + seq_up.count("G")) / length * 100, 2)


# Correctly spelled alias; the original (misspelled) name stays available for existing callers.
NucleicAcidSequence = NucleicAcidSequnce


class DNASequence(NucleicAcidSequnce):
    """
    Class for DNA sequence
    """

    _alphabet = NucleicAcidSequnce.dna_alphabet
    complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
                       'a': 't', 't': 'a', 'g': 'c', 'c': 'g'}

    def transcribe(self):
        """
        Return the transcribed sequence (T replaced by U, case preserved).

        :rtype: RNASequence
        :raises TypeError: if the sequence is not a correct DNA
        """
        self.is_alphabet_correct()
        return RNASequence(self.seq.replace('T', 'U').replace('t', 'u'))


class RNASequence(NucleicAcidSequnce):
    """
    Class for RNA sequence
    """

    _alphabet = NucleicAcidSequnce.rna_alphabet
    # RNA pairs A with U; the generic table paired A with T and so produced invalid RNA.
    complement_dict = {'A': 'U', 'U': 'A', 'G': 'C', 'C': 'G',
                       'a': 'u', 'u': 'a', 'g': 'c', 'c': 'g'}


class AminoAcidSequence(BiologicalSequence):
    """
    Class for protein sequence
    """

    protein_alphabet = frozenset('ACDEFGHIKLMNPQRSTVWY')
    # Mass in Da used for every one-letter code.
    # NOTE(review): these look like masses of free amino acids (no water subtracted per
    # peptide bond) - confirm that this is the intended definition of "mass of residues".
    _WEIGHTS = {'A': 89.09, 'R': 174.20, 'N': 132.12, 'D': 133.10, 'C': 121.15,
                'E': 147.13, 'Q': 146.15, 'G': 75.07, 'H': 155.16, 'I': 131.17,
                'L': 131.17, 'K': 146.19, 'M': 149.21, 'F': 165.19, 'P': 115.13,
                'S': 105.09, 'T': 119.12, 'W': 204.23, 'Y': 181.19, 'V': 117.15}

    def __init__(self, seq):
        self.seq = seq

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, item):
        return self.seq[item]

    def __str__(self):
        return self.seq

    def is_alphabet_correct(self):
        """
        Check that the sequence contains only upper-case one-letter amino acid codes.

        :rtype: bool
        :return: True when the sequence is valid
        :raises TypeError: if the sequence contains any other symbol
        """
        if set(self.seq).issubset(self.protein_alphabet):
            return True
        raise TypeError(f'{self.seq} is not a protein')

    def calculate_protein_mass(self):
        """
        Return the sum of the masses of all residues in the sequence, in Da.

        :raises TypeError: if the sequence is not a protein
        """
        # The original tested the bound method object (always truthy) instead of calling it,
        # so invalid sequences were never rejected.
        self.is_alphabet_correct()
        return sum(self._WEIGHTS[aa] for aa in self.seq)


# End of Bio_Seq_Analysis_Tool.py. The rest of this diff rewrites README.md, whose previous
# text began: "# BSAT" / "BSAT - Biological Sequences Analysis Toolbox."
Repo contains tools which can help work with nucleic acid or protein sequences and with NGS-reads +# BSAT - Biological Sequences Analysis Toolbox + +This repository contains tools that help you work with nucleic acid or protein sequences and with NGS reads. It can process multiple sequences at once, which makes analysis faster. + +## Installation + +To use this toolbox, clone the repository: + +```shell +git clone git@github.com:grishchenkoira/BSAT.git +cd BSAT +``` + +### System requirements: + +Key packages and programs: +- [Python](https://www.python.org/downloads/) (version >= 3.9) +- [Biopython](https://biopython.org/) (imported by Bio_Seq_Analysis_Tool) + +## Usage + +```python +# import main function Bio_Seq_Analysis_Tool +from Bio_Seq_Analysis_Tool import Bio_Seq_Analysis_Tool + +# import main function Bio_Files_Processor +from Bio_Files_Processor import Bio_Files_Processor +``` + +## Working with main functions + +This section describes the two main scripts: Bio_Files_Processor and Bio_Seq_Analysis_Tool. + +## Bio_Files_Processor + +Functions from this script help you analyze genomic data from different databases, saved in the standard formats of those databases. + +### convert_multiline_fasta_to_oneline(input_fasta: str, output_fasta: str = None) -> str + +Converts a multiline FASTA file into a FASTA file in which every record is a name line marked by '>' followed by a single line with the sequence. Results are saved to a new file in the folder "Converted_data". If the folder already exists, new data is written to it. + +**Parameters:** + +- **input_fasta**: *str* + +Path to the FASTA file with your sequences. !! The name must be given together with its extension (.fasta) !! The function checks that the format is correct. + +- **output_fasta**: *str* + +Name of the new FASTA file with your sequences in one line, default value = None. 
The name must be given together with its extension (.fasta) + +**Returns:** + +Script completion message + +**Example** + +```python +convert_multiline_fasta_to_oneline('./data_example/example_multiline_fasta.fasta') # 'All sequences processed!' +``` +Structure of new file with FASTA-seq: +'>_name: +sequence' + +### select_genes_from_gbk_to_fasta(input_gbk: str, genes: List[str], n_before: int = 1, n_after: int = 1, output_fasta: str = None) -> str + +Helps to find the neighbours of a GOI (gene of interest). The function writes the neighbours of the GOI to a new FASTA file as: name of gene, protein sequence. Results are saved to the folder "Analyzed_data". If the folder already exists, new data is written to it. + +**Parameters:** + +- **input_gbk**: *str* + +Path to the gbk file with your sequences. !! The name must be given together with its extension (.gbk) !! The function checks that the format is correct. + +- **genes**: *List[str]* + +Names of the genes of interest. + +Example of input: +```python +['pndA'] +``` +- **n_before**: *int* + +Number of genes before GOI (>0), default value = 1 + +- **n_after**: *int* + +Number of genes after GOI (>0), default value = 1 + +- **output_fasta**: *str* + +Name of the new FASTA file with your sequences, default value = None. The name must be given together with its extension (.fasta) + +**Returns:** +Script completion message + +**Example** +```python +select_genes_from_gbk_to_fasta('.\\example_data\\example_gbk.gbk', ['pndA'], 2, 2) # 'All sequences processed!' +``` +Structure of new file with FASTA-seq: +'> gene_name: +protein_sequence' + +## Bio_Seq_Analysis_Tool + +The main functions from this script help you analyze different types of biological sequences: DNA, RNA, NGS reads and proteins. + +### dna_rna_analysis(*args: str, operation: str) + +This function performs a number of operations on DNA or RNA. 
+Operations supported by this functions: +- transcribe - return transcribed sequence +- reverse - return reverse sequence +- complement - return complement sequence +- reverse_complement - return reverse complement sequence +- gc_calculate - return sequence GC-content in percent + +**Parameters:** +- **args**: *str* + +Nucleic acid sequence +- **operation**: *str* + +Type of operation required + +**Returns**: +- **analysis**: *str* + +Analysis of nucleic acid sequence + +### analyse_fastq(seqs, gc_bounds, length_bounds, quality_threshold) + +Apply one of the operations described below to fastq sequences. + +**Parameters:** +- **seqs**: *dict* + +A dictionary consisting of fastq sequences. The structure is as follows: Key - string, sequence name. The value is a tuple of two strings: sequence and quality. The sequence is RNA or DNA. + +- **gc_bounds**: *Union[int, float, Tuple [int], Tuple [float]]* + +Boundary parameters for filtering sequences by GC-content. Save only reads with a GC-content between boundaries or lower than one boundary. Lower boundary cannot be less than 0 and upper boundary cannot be greater than 100. gc_bounds default value is (0,100). + +- **length_bounds** : *Union[int, Tuple [int]]* + +Boundary parameters for filtering sequences by length. Works the same as gc_bounds. Lower boundary cannot be less than 0 and upper boundary cannot be greater than 2^32. length_bounds default value is (0,2^32). + +- **quality_threshold** : *float* + +Threshold for quality of each nucleotide in read. Quality incodes by ASCII codes. The threshold cannot be more than 40. quality_threshold default value is 0 + +**Returns**: +- **analysed_seq**: *Dict[str]* + +New dictionary with fastq sequence.This one consists of filtered fastq sequences and + +- **analysed_seq**: *Dict[str, str]* + +New dictionary with fastq sequence. This one consists of sequences that did not pass filters. 
+ +### run_protein_analysis(*args: str) + +Apply operations described below to any number of sequences with any case. + +**Parameters:** +**\*args**: +- **sequences**: *str* + +input coma-separated sequences in 1-letter or 3-letter code with any case (as many as you wish) +- **add_arg**: *str* + +necessary parameter for certain functions (for example, specify target protein site) +- **procedure** : *str* + +specify procedure you want to apply + +**Returns**: +- **operation_result**: str or list + +result of function work in list or str format (dependent on number of input sequences) + +**Note!** +- Operation name always must be the last argument +- Additional argument must be always before operation name + +## Modules + +This section contains description of modules using by main functions you can find in our library. + +- DNA & RNA analysis tool(#title1) +- FASTQ analysis tool(#title2) +- Amino acid sequences analysis tool(#title3) + +### DNA & RNA analysis + +This module performs a number of operations on DNA or RNA. + +#### Operations + +##### transcribe(seq) + +Function return return transcribed sequence. + +**Parameters:** +- **seq**: *str* + +DNA sequence + +**Returns:** +- **rna_seq**: *str* + +**Example** +```python +run_dna_rna_tools('ATG', 'transcribe') # 'AUG' +``` + + +##### reverse(seq) + +Function return return reversed sequence. + +**Parameters:** +- **seq**: *str* + +DNA or RNA sequence + +**Returns:** +- **reverse_seq**: *str* + +**Example** +```python +run_dna_rna_tools('ATG', 'reverse') # 'GTA' +``` + +##### complement(seq) + +Function return return complement sequence. + +**Parameters:** +- **seq**: *str* + +DNA or RNA sequence + +**Returns:** +- **complement_seq**: *str* + +**Example** +```python +run_dna_rna_tools('AtG', 'complement') # 'TaC' +``` + +##### reverse_complement(seq) + +Function return return reverse complement sequence. 
+ +**Parameters:** +- **seq**: *str* + +DNA or RNA sequence + +**Returns:** +- **reverse_complement_seq**: *str* + +**Example** +```python +run_dna_rna_tools('ATg', 'reverse_complement') # 'cAT' +``` + +##### gc_calculate(seq) + +Function return sequence GC-content in percent. + +**Parameters:** +- **seq**: *str* + +DNA or RNA sequence + +**Returns:** +- **gc_content**: *str* + +**Example** +```python +run_dna_rna_tools ('GTAccca','gc_calculate') # '57.14' +``` + + +### FASTQ analysis tool + +This module contains functions for FASTQ sequnces filtration. The function allow to filter the desired reads according to three parameters: GC-content, length and reading quality. + +#### Operations + +##### analyse_gc(seq) + +Return GC-content of DNA/RNA sequence. + +**Parameters:** + +- **seq**: *str* + +DNA/RNA sequence + +**Returns:** +- **gc_content**: *float* + +##### analyse_length(seq) + +Return length of DNA/RNA sequence + +**Parameters:** + +- **seq**: *str* + +DNA/RNA sequence + +**Returns:** +- **length**: *int* + +##### analyse_quality(seq) + +Return quality score of read, that coding by ASCII code + +**Parameters:** + +- **seq**: *str* + +quality symbols for each nucleotide + +**Returns:** +- **q_score_sum**: *float* + + +### Amino acid sequences analysis tool + +This module contains functions for protein sequences analysis. You can reencode peptides sequences: 1-letter to 3-letter code and vice versa, calculate physical features, find specific sites, get predicted mRNA that coding your protein. + +#### Operations + +##### change_residues_encoding(seq, query='one') + +Transfer amino acids from 3-letter to 1-letter code and vice versa. + +**Parameters:** + +- **seq**: *str* + +Input protein seq in any encoding and case. If the input is a sequence of amino acids written in a three-letter code, then the amino acids must be separated by a space. 
If the input is a sequence of amino acids written in a single-letter code, then the amino acids may not be separated by a space. + +- **encoding**: {'one', 'three'}, default: 'one' + +specify target encoding + +**Returns:** +- **encode_seq_registered**: *str* + +same protein seq in another encoding + +**Example** +```python +seq = 'AAA' +change_residues_encoding(seq, 'one', 'change_residues_encoding') # 'AAA' + +seq = 'ALA ALA ALA' +change_residues_encoding(seq, 'one', 'change_residues_encoding') # 'AAA' + +seq = 'AAA' +change_residues_encoding(seq, 'three', 'change_residues_encoding') # 'ALA ALA ALA' +``` + +##### is_protein(seq) + +Check if sequence is protein or not by identify invalid seq elements, which are not presented in dicts above. + +**Parameters:** +- **seq**: *str* + +input protein seq in 1-letter encoding and upper case + +**Returns:** +- **verification_result**: *bool* + +if seq is correct protein seq or not + +**Example** +```python +seq = 'AAA' +is_protein(seq) #True +``` + +##### get_seq_characteristic(seq) + +Count entry of each residue type in your sequence + +**Parameters:** +- **seq**: *str* + +input protein seq in 1-letter encoding and upper case + +**Returns:** +- **res_count**: *dict* + +each residue type in seq in 3-letter code and its amount in current seq + +**Example** +```python +seq = 'AAA' +get_seq_characteristic(seq) #{'ALA': 3} +``` + +##### find_res(seq, res_of_interest) + +Find all positions of certain residue in your seq + +**Parameters:** +- **seq**: *str* + +input protein seq in 1-letter encoding and upper case +- **res_of_interest**: *str* + +residue of interest in 1-letter encoding and upper case + +**Returns:** +- **res_positions**: *str* + +positions of specified residue in your seq + +**Example** +```python +seq = 'AAA' +res = 'A' +find_res(seq, res) # 'A positions: [1, 2, 3]' +``` + +##### find_site(seq, site) + +Find if seq contains certain site and get positions of its site + +**Parameters:** +- **seq**: *str* + +input 
protein seq in 1-letter encoding and upper case + +- **site**: *str* + +specify site of interest + +**Returns:** +- **site_positions**: *str* + +the range of values for amino acid positions of specified site in your seq in which the last number is excluded + +**Example** +```python +seq = 'AAADDDF' +site = 'AAA' +find_site(seq, site) # "Site entry in sequence = 1. Site residues can be found at positions: ['1:4']" +``` + +##### calculate_protein_mass(seq) + +Get sum of residues masses in your seq in Da + +**Parameters:** +- **seq**: *str* + +input protein seq in 1-letter encoding and upper case + +**Returns:** +- **total_mass**: *float* + +mass of all residues in seq in Da + +**Example** +```python +seq = 'AAA' +calculate_protein_mass(seq) #267 +``` + +##### calculate_average_hydrophobicity(seq) + +Get average hydrophobicity index for protein seq as sum of index for each residue in your seq divided by its length + +**Parameters:** +- **seq**: *str* + +input protein seq in 1-letter encoding and upper case + +**Returns:** +- **average_hydrophobicity_idx**: *float* + +average hydrophobicity index for your seq + +**Example** +```python +seq = 'AAA' +calculate_average_hydrophobicity(seq) #1.8 +``` + +##### get_mrna(seq) + +Get encoding mRNA nucleotides for your seq + +**Parameters:** +- **seq**: *str* + +input protein seq in 1-letter encoding and upper case + +**Returns:** +- **mrna_seq**: *str* + +potential encoding mRNA sequences with multiple choice for some positions + +**Example** +```python +seq = 'AAA' +get_mrna(seq) # ['GCN', 'GCN', 'GCN'] +``` + +##### calculate_isoelectric_point(seq) + +Find isoelectrinc point as sum of known pI for residues in your seq + +**Parameters:** +- **seq**: *str* + +input protein seq in 1-letter encoding and upper case + +**Returns:** +- **pi**: *float* + +isoelectric point for your seq + +**Example** +```python +seq = 'AAA' +calculate_isoelectric_point(seq) # 6.01 +``` +##### analyze_secondary_structure(seq) + +Calculates the 
percentage of amino acids found in the three main types of protein secondary structure: beta-turn, beta-sheet and alpha-helix in your seq + +**Parameters:** +- **seq**: *str* + +input protein seq in 1-letter encoding and upper case + +**Returns:** +- **result**: *list* + +percentage of amino acids belonging to three types of secondary structure for seq + +**Example** +```python +seq = 'AAA' +analyze_secondary_structure(seq) # [0.0, 0.0, 100.0] +``` + +## Contact + +*This is the repo for the 5th homework of the BI Python 2023 course* + +Author: +- *Grishenko Irina* \ No newline at end of file