Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
91e416f
Initial commit for dna_rna_analysis.py
grishchenkoira Oct 7, 2023
52772ec
Initial commit for fastq_analysis.py
grishchenkoira Oct 7, 2023
edf755b
Initial commit for protein_analysis
grishchenkoira Oct 7, 2023
b247115
Add python script with all functions for this module
grishchenkoira Oct 7, 2023
78dc3b5
Add python script with all functions for this module
grishchenkoira Oct 7, 2023
c6ade48
Add python script with all functions for protein module
grishchenkoira Oct 7, 2023
caee4ae
Initial commit for main script of Bio_Seq_Analysis_Tool
grishchenkoira Oct 7, 2023
10e061b
Add folder with required modules for Bio_Seq_Analysis_Tool.py
grishchenkoira Oct 7, 2023
1de06fd
Add python script with all functions into Bio_Seq_Analysis_Tool
grishchenkoira Oct 7, 2023
be10c57
Add README for Bio_Seq_Analysis_Tool module with detailed description…
grishchenkoira Oct 7, 2023
247f2ab
Add def for read FASTQ-seq and def for creating file with filtered FA…
grishchenkoira Oct 11, 2023
f24c092
Add into def analyze_fastq reading data from a file and writing retur…
grishchenkoira Oct 11, 2023
8222e3f
Fix bug in def read_fastq and write_fastq
grishchenkoira Oct 11, 2023
8333b66
Fix bugs in def analyse_fastq
grishchenkoira Oct 11, 2023
c7b0739
Include boundaries for analysis in fastq_analysis in Bio_seq_Analysis…
grishchenkoira Oct 17, 2023
6c5a702
Initial commit for Bio_Files_Processor.py
grishchenkoira Oct 17, 2023
8665b41
Add import of standard modules in Bio_Files_Processor.py
grishchenkoira Oct 17, 2023
f9af641
Add function 'convert_multiline_fasta_to_oneline' into Bio_Files_Proc…
grishchenkoira Oct 17, 2023
16fa99a
Add function 'select_genes_from_gbk_to_fasta' into Bio_Files_Processo…
grishchenkoira Oct 17, 2023
52eceb8
Add corrections to the description of the functions into Bio_Files_Pr…
grishchenkoira Oct 17, 2023
4735049
Fix output_fasta parameter in 'Convert...' function into Bio_Files_Pro…
grishchenkoira Oct 17, 2023
fea880d
Fix input parameter in 'Convert...' function into Bio_Files_Processor.py
grishchenkoira Oct 17, 2023
0aea07f
Add data format check in select_genes_from_gbk_to_fasta
grishchenkoira Oct 17, 2023
640a391
Add data format check in convert_multiline_fasta_to_oneline
grishchenkoira Oct 17, 2023
180e726
Add information about Bio_Files_Processor.py into README.md
grishchenkoira Oct 17, 2023
db4d2ea
Delete modules_for_BSAT/.ipynb_checkpoints/protein_analysis-checkpoin…
grishchenkoira Oct 17, 2023
fe68710
Delete modules_for_BSAT/.ipynb_checkpoints/dna_rna_analysis-checkpoin…
grishchenkoira Oct 17, 2023
9e52d57
Delete modules_for_BSAT/.ipynb_checkpoints/fastq_analysis-checkpoint.py
grishchenkoira Oct 17, 2023
a1d5be1
Rewritten code to add complete protein sequence in select_genes_from_…
grishchenkoira Oct 18, 2023
dca7a00
Rewrite function to generate a FASTA-file
grishchenkoira Oct 18, 2023
966e1ff
Rewrite FASTQ-filtrator module
grishchenkoira Feb 21, 2024
9ab23c8
Remove modules for BSAT
grishchenkoira Feb 21, 2024
5b3a810
Add classes for working with biological sequences
grishchenkoira Feb 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions Bio_Files_Processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import os
from typing import List

def convert_multiline_fasta_to_oneline(input_fasta: str, output_fasta: str = None) -> str:
    """
    Convert a multiline FASTA file into a FASTA file where every record takes exactly two lines:
    the name line (starting with '>') and a single line holding the whole sequence.
    The result is saved into the folder "Converted_data" in the current working directory.
    If the folder already exists, the new file is written into it.

    Lines that appear before the first '>' header are ignored. Records that share the same
    header are all kept, in the order they appear in the input.

    :param input_fasta: Path to the FASTA file with your sequences.
                        !! The name must include the extension (.fasta) !!
    :type input_fasta: str
    :param output_fasta: Name of the new FASTA file, default value = None.
                         When omitted, the name is 'one_line_' + the input file name.
                         The name should include the extension (.fasta).
    :type output_fasta: str
    :rtype: str
    :return: Script completion message
    :raises ValueError: if ``input_fasta`` does not end with '.fasta'
    """
    # The extension must be checked at the END of the name; `find(...) == 0`
    # would only reject names that start with '.fasta'.
    if not input_fasta.lower().endswith('.fasta'):
        raise ValueError('Wrong file format in input!')

    if output_fasta is None:
        # Use the file name only, so that a path like 'data/seqs.fasta'
        # becomes 'one_line_seqs.fasta' inside the output folder.
        out_name = 'one_line_' + os.path.basename(input_fasta)
    else:
        out_name = output_fasta

    out_dir = os.path.join('.', 'Converted_data')
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(out_dir, exist_ok=True)

    # Each record is (header, [sequence chunks]); a list keeps duplicated
    # headers apart instead of letting a later one reset an earlier one.
    records = []
    with open(input_fasta) as seq_fasta:
        for line in seq_fasta:
            line = line.rstrip('\n')
            if line.startswith('>'):
                records.append((line, []))
            elif records:
                records[-1][1].append(line)

    with open(os.path.join(out_dir, out_name), mode='w') as new_seq_fasta:
        for header, chunks in records:
            new_seq_fasta.write(header + '\n')
            new_seq_fasta.write(''.join(chunks) + '\n')
    return 'All sequences processed!'


def _wrapped_neighbours(gene_names, position, n_before, n_after):
    """Yield the names around ``position``, closest first, treating ``gene_names`` as circular."""
    total = len(gene_names)
    offsets = [-step for step in range(1, n_before + 1)]
    offsets += [step for step in range(1, n_after + 1)]
    for offset in offsets:
        index = (position + offset) % total
        # With a short gene list a large window can wrap back onto the gene itself.
        if index != position:
            yield gene_names[index]


def select_genes_from_gbk_to_fasta(input_gbk: str, genes: List[str], n_before: int = 1, n_after: int = 1,
                                   output_fasta: str = None) -> str:
    '''
    Search for neighbours of GOI (genes of interest) in a GenBank file and write them into a new
    FASTA file as: name of gene, protein sequence. The result is saved into the folder
    "Analyzed_data" in the current working directory. If the folder already exists, the new file
    is written into it.

    A gene of interest matches every gene whose ``/gene`` value contains the given text.
    The list of genes is treated as circular: neighbours of the first gene wrap around to the
    end of the list and neighbours of the last gene wrap around to the beginning.
    Genes of interest themselves are written only when they are neighbours of another match.
    A neighbour for which no ``/translation`` is found is written with an empty sequence line.

    :param input_gbk: Path to the gbk file with your sequences.
                      !! The name must include the extension (.gbk) !!
    :type input_gbk: str
    :param genes: Gene of interest names (a single string is treated as one name)
    :type genes: List[str]
    :param n_before: number of genes before GOI (>0), default value = 1
    :type n_before: int
    :param n_after: number of genes after GOI (>0), default value = 1
    :type n_after: int
    :param output_fasta: Name of FASTA file with neighbours of GOI (names and seqs),
                         default value = None ('output_for_gbk.fasta' is used)
    :type output_fasta: str
    :rtype: str
    :return: Script completion message
    :raises ValueError: if ``input_gbk`` does not end with '.gbk'
    '''
    # The extension must be checked at the END of the name; `find(...) == 0`
    # would only reject names that start with '.gbk'.
    if not input_gbk.lower().endswith('.gbk'):
        raise ValueError('Wrong file format in input!')
    if isinstance(genes, str):
        # Iterating over a bare string would search for every single character.
        genes = [genes]
    if output_fasta is None:
        output_fasta = 'output_for_gbk.fasta'
    out_dir = os.path.join('.', 'Analyzed_data')
    os.makedirs(out_dir, exist_ok=True)

    # First pass: ordered list of raw /gene values (quotes are kept, as before,
    # so the substring matching against `genes` behaves the same).
    gene_names = []
    with open(input_gbk) as gbk:
        for line in gbk:
            stripped = line.strip()
            # startswith('/gene=') avoids matching /gene_synonym and free text.
            if stripped.startswith('/gene='):
                name = stripped.partition('=')[2]
                # The same name twice in a row is presumably a 'gene' feature followed
                # by its 'CDS'; count it once so a gene is not its own neighbour.
                if not gene_names or gene_names[-1] != name:
                    gene_names.append(name)

    # Neighbour name -> protein; '' until a translation is found so that
    # writing never fails on a missing sequence.
    neighbour_genes = dict()
    for wanted in genes:
        for position, name in enumerate(gene_names):
            if wanted in name:
                for neighbour in _wrapped_neighbours(gene_names, position, n_before, n_after):
                    neighbour_genes.setdefault(neighbour, '')

    # Second pass: collect the translation that follows each neighbour's /gene line.
    with open(input_gbk) as gbk:
        current = None  # neighbour gene whose translation is awaited
        chunks = None   # pieces of a multi-line translation being read
        for line in gbk:
            stripped = line.strip()
            if chunks is not None:
                chunks.append(stripped)
                if '"' in stripped:  # closing quote ends the translation
                    neighbour_genes[current] = ''.join(chunks)
                    current, chunks = None, None
                continue
            if stripped.startswith('/gene='):
                name = stripped.partition('=')[2]
                # Reset on every /gene line so a translation is never
                # attributed to a gene that is not a neighbour.
                current = name if name in neighbour_genes else None
            elif current is not None and stripped.startswith('/translation='):
                value = stripped.partition('=')[2]
                if value.count('"') >= 2:
                    # Whole translation fits on one line.
                    neighbour_genes[current] = value
                    current = None
                else:
                    chunks = [value]

    with open(os.path.join(out_dir, output_fasta), mode='w') as fasta:
        for name, seq in neighbour_genes.items():
            fasta.write('>' + name.replace('"', '') + '\n')
            fasta.write(seq.replace('"', '') + '\n')
    return 'All sequences processed!'

Loading