From 8eabf4010c5ee9816338c46be24c8ab826025718 Mon Sep 17 00:00:00 2001 From: shradhasaraf Date: Thu, 10 Jul 2025 15:46:53 +0100 Subject: [PATCH 1/3] Added script for gene merging --- merge_genes.py | 234 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 merge_genes.py diff --git a/merge_genes.py b/merge_genes.py new file mode 100644 index 000000000..896436463 --- /dev/null +++ b/merge_genes.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Merge Genes script. + +This script aims to merge the genes from Nipponbare 3 annotation sources (RAPDB, MSU and Gramene). + +Below are some of the criteria to consider: +1. (plus) and (minus) strand genes are considered separately +2. The genes are considered as overlapping if the overlap length of the sequence is >=50% of the length of the smallest line. + +""" +import argparse +import logging +from pathlib import Path + +import pandas + +from ensembl.utils import StrPath +from ensembl.utils.argparse import ArgumentParser +from ensembl.utils.logging import init_logging_with_args + + +def update_gene_info( + update_gene: dict, + id_list: list, + merged_chromosome: str, + merged_gene_start: str, + merged_gene_end: str, + merged_strand: str, +) -> dict: + """ + Takes the id_list, other parameters from main, update the gene information and returns to the main + + Args: + update_gene (dict): column names as keys and corresponding parameters are values in teh form of list + id_list (list): A list of ids from genes to merge + merged_chromosome: Chromosome number from the merged gene + merged_gene_start: New gene start coordinate after merging the gene + merged_gene_end: New gene end coordinate after merging the gene + merged_strand: Strand from the merged genes + + Returns: + The updated gene. + """ + update_gene["chr"].append(merged_chromosome) + update_gene["source"].append("panoryza") + update_gene["gene"].append("gene") + update_gene["start"].append(merged_gene_start) + update_gene["end"].append(merged_gene_end) + update_gene["score"].append(".") + update_gene["strand"].append(merged_strand) + update_gene["frame"].append(".") + merge_ids = ",".join(id_list) + update_gene["attribute"].append(merge_ids) + return update_gene + + +def sort_genes(output_file: dict) -> pandas.DataFrame: + """Sort genes. + + Args: + output_file: gene information in a dictionary + Returns: + A sorted gene matrix. + + """ + gene_matrix = pandas.DataFrame.from_dict(output_file) + gene_matrix["chr_no"] = gene_matrix["chr"].str.extract("(\d+)", expand=False).astype(int) + gene_matrix.sort_values(by=["chr_no", "start"], inplace=True) + gene_matrix.drop("chr_no", axis=1, inplace=True) + return gene_matrix + + +def merge_genes(input_gff_file: StrPath) -> dict: + """Merge genes from the input GFF file. + + Args: + input_gff_file: Path to the GFF file. + + Returns: + The merged genes in a dictionary. + """ + + list_id = [] + dict_merged_gene = { + "chr": [], + "source": [], + "gene": [], + "start": [], + "end": [], + "score": [], + "strand": [], + "frame": [], + "attribute": [], + } + + logging.info("The input file is " + str(input_gff_file)) + + with open(input_gff_file) as gff_file: + lines = gff_file.readlines() + ## if line is starting with 'Chr' then split it and assign to different variables + for line in lines: + if line.startswith("#"): + continue + chromo, _, _, start, end, _, strand, _, description = line.split("\t") + gene_id = description.split(";Name=")[0].split(";biotype=")[0] + start = int(start) + end = int(end) + + ### Assign the values for the first gene in the gff + if line == lines[0]: + gene_start = start + gene_end = end + current_chromosome = str(line.split("\t")[0]) + current_strand = str(line.split("\t")[6]) + + ## Checking if the strand and chromosome are same between the previous and current line; otherwise print and start with the new gene + if strand != current_strand or chromo != current_chromosome: + dict_merged_gene = update_gene_info( + dict_merged_gene, list_id, current_chromosome, gene_start, gene_end, current_strand + ) + current_strand = strand + gene_start = start + gene_end = end + current_chromosome = chromo + list_id = [] + + ## if start of the next gene is lesser than the current gene, raise exception + if start < gene_start: + raise Exception("Error: File is not sorted properly") + + ## if start coordinate of a gene is equal or greater than the previous AND if start coordinate is lesser than the gene end then start merging the genes + elif (start == gene_start or start > gene_start) and start < gene_end: + ## if end coord of current gene is lesser than the previous, then merge the gene into the previous one + if end <= gene_end: + list_id.append(gene_id) + + ## if end coord of current gene is greater than the previous, then calculate the length of both genes and check overlap criteria + elif end > gene_end: + gene_length = gene_end - gene_start + length = end - start + min_length = min(length, gene_length) + overlap_length = gene_end - start + + ## Checking the overlap criteria + if int(overlap_length) * 2 >= int(min_length): + gene_end = end + list_id.append(gene_id) + logging.info("overlap\n") + else: + dict_merged_gene = update_gene_info( + dict_merged_gene, + list_id, + current_chromosome, + gene_start, + gene_end, + current_strand, + ) + list_id = [] + gene_start = start + gene_end = end + list_id.append(gene_id) + + ## if start coord of a gene is greater than the end coord of the previous gene then print the previous gene (current gene will not merge into that) + if start >= gene_end: + dict_merged_gene = update_gene_info( + dict_merged_gene, list_id, current_chromosome, gene_start, gene_end, current_strand + ) + list_id = [] + gene_start = start + gene_end = end + list_id.append(gene_id) + + ## If last line in the file, print the gene(s) + if line == lines[-1]: + dict_merged_gene = update_gene_info( + dict_merged_gene, list_id, current_chromosome, gene_start, gene_end, current_strand + ) + return dict_merged_gene + + +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + """Parse the aruments from the command line. + + Args: + arg_list: List of arguments to parse. If `None`, grab them from the command line. + + Returns: + Parsed arguments. + + """ + parser = ArgumentParser(description="Merge genes script") + parser.add_argument_src_path( + "--input_gff_file", + default="sorted_all_line.gff", + help="Input file with gene info from RAPDB, MSU and Gramene", + ) + parser.add_argument_dst_path( + "--output_dir", default=".", help="Output directory for the merged genes file" + ) + parser.add_log_arguments() + return parser.parse_args(arg_list) + + +def main(arg_list: list[str] | None = None) -> None: + """Main script entry point. + + Args: + arg_list: Arguments to parse passing list to `parse_args()`. + """ + args = parse_args(arg_list) + init_logging_with_args(args) + + merge_genes_metadata = merge_genes(args.input_gff_file) + sorted_genes = sort_genes(merge_genes_metadata) + sorted_genes.to_csv(args.output_dir / "merged_genes.tsv", sep="\t", index=False) + + +if __name__ == "__main__": + main() From 8cd5b5cf084087cc79998d0b3f605024e32fd168 Mon Sep 17 00:00:00 2001 From: shradhasaraf Date: Thu, 10 Jul 2025 15:53:36 +0100 Subject: [PATCH 2/3] Added script for gene merging --- scripts/merge_genes.py | 234 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 scripts/merge_genes.py diff --git a/scripts/merge_genes.py b/scripts/merge_genes.py new file mode 100644 index 000000000..896436463 --- /dev/null +++ b/scripts/merge_genes.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Merge Genes script. + +This script aims to merge the genes from Nipponbare 3 annotation sources (RAPDB, MSU and Gramene). + +Below are some of the criteria to consider: +1. (plus) and (minus) strand genes are considered separately +2. The genes are considered as overlapping if the overlap length of the sequence is >=50% of the length of the smallest line. + +""" +import argparse +import logging +from pathlib import Path + +import pandas + +from ensembl.utils import StrPath +from ensembl.utils.argparse import ArgumentParser +from ensembl.utils.logging import init_logging_with_args + + +def update_gene_info( + update_gene: dict, + id_list: list, + merged_chromosome: str, + merged_gene_start: str, + merged_gene_end: str, + merged_strand: str, +) -> dict: + """ + Takes the id_list, other parameters from main, update the gene information and returns to the main + + Args: + update_gene (dict): column names as keys and corresponding parameters are values in teh form of list + id_list (list): A list of ids from genes to merge + merged_chromosome: Chromosome number from the merged gene + merged_gene_start: New gene start coordinate after merging the gene + merged_gene_end: New gene end coordinate after merging the gene + merged_strand: Strand from the merged genes + + Returns: + The updated gene. + """ + update_gene["chr"].append(merged_chromosome) + update_gene["source"].append("panoryza") + update_gene["gene"].append("gene") + update_gene["start"].append(merged_gene_start) + update_gene["end"].append(merged_gene_end) + update_gene["score"].append(".") + update_gene["strand"].append(merged_strand) + update_gene["frame"].append(".") + merge_ids = ",".join(id_list) + update_gene["attribute"].append(merge_ids) + return update_gene + + +def sort_genes(output_file: dict) -> pandas.DataFrame: + """Sort genes. + + Args: + output_file: gene information in a dictionary + Returns: + A sorted gene matrix. + + """ + gene_matrix = pandas.DataFrame.from_dict(output_file) + gene_matrix["chr_no"] = gene_matrix["chr"].str.extract("(\d+)", expand=False).astype(int) + gene_matrix.sort_values(by=["chr_no", "start"], inplace=True) + gene_matrix.drop("chr_no", axis=1, inplace=True) + return gene_matrix + + +def merge_genes(input_gff_file: StrPath) -> dict: + """Merge genes from the input GFF file. + + Args: + input_gff_file: Path to the GFF file. + + Returns: + The merged genes in a dictionary. + """ + + list_id = [] + dict_merged_gene = { + "chr": [], + "source": [], + "gene": [], + "start": [], + "end": [], + "score": [], + "strand": [], + "frame": [], + "attribute": [], + } + + logging.info("The input file is " + str(input_gff_file)) + + with open(input_gff_file) as gff_file: + lines = gff_file.readlines() + ## if line is starting with 'Chr' then split it and assign to different variables + for line in lines: + if line.startswith("#"): + continue + chromo, _, _, start, end, _, strand, _, description = line.split("\t") + gene_id = description.split(";Name=")[0].split(";biotype=")[0] + start = int(start) + end = int(end) + + ### Assign the values for the first gene in the gff + if line == lines[0]: + gene_start = start + gene_end = end + current_chromosome = str(line.split("\t")[0]) + current_strand = str(line.split("\t")[6]) + + ## Checking if the strand and chromosome are same between the previous and current line; otherwise print and start with the new gene + if strand != current_strand or chromo != current_chromosome: + dict_merged_gene = update_gene_info( + dict_merged_gene, list_id, current_chromosome, gene_start, gene_end, current_strand + ) + current_strand = strand + gene_start = start + gene_end = end + current_chromosome = chromo + list_id = [] + + ## if start of the next gene is lesser than the current gene, raise exception + if start < gene_start: + raise Exception("Error: File is not sorted properly") + + ## if start coordinate of a gene is equal or greater than the previous AND if start coordinate is lesser than the gene end then start merging the genes + elif (start == gene_start or start > gene_start) and start < gene_end: + ## if end coord of current gene is lesser than the previous, then merge the gene into the previous one + if end <= gene_end: + list_id.append(gene_id) + + ## if end coord of current gene is greater than the previous, then calculate the length of both genes and check overlap criteria + elif end > gene_end: + gene_length = gene_end - gene_start + length = end - start + min_length = min(length, gene_length) + overlap_length = gene_end - start + + ## Checking the overlap criteria + if int(overlap_length) * 2 >= int(min_length): + gene_end = end + list_id.append(gene_id) + logging.info("overlap\n") + else: + dict_merged_gene = update_gene_info( + dict_merged_gene, + list_id, + current_chromosome, + gene_start, + gene_end, + current_strand, + ) + list_id = [] + gene_start = start + gene_end = end + list_id.append(gene_id) + + ## if start coord of a gene is greater than the end coord of the previous gene then print the previous gene (current gene will not merge into that) + if start >= gene_end: + dict_merged_gene = update_gene_info( + dict_merged_gene, list_id, current_chromosome, gene_start, gene_end, current_strand + ) + list_id = [] + gene_start = start + gene_end = end + list_id.append(gene_id) + + ## If last line in the file, print the gene(s) + if line == lines[-1]: + dict_merged_gene = update_gene_info( + dict_merged_gene, list_id, current_chromosome, gene_start, gene_end, current_strand + ) + return dict_merged_gene + + +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + """Parse the aruments from the command line. + + Args: + arg_list: List of arguments to parse. If `None`, grab them from the command line. + + Returns: + Parsed arguments. + + """ + parser = ArgumentParser(description="Merge genes script") + parser.add_argument_src_path( + "--input_gff_file", + default="sorted_all_line.gff", + help="Input file with gene info from RAPDB, MSU and Gramene", + ) + parser.add_argument_dst_path( + "--output_dir", default=".", help="Output directory for the merged genes file" + ) + parser.add_log_arguments() + return parser.parse_args(arg_list) + + +def main(arg_list: list[str] | None = None) -> None: + """Main script entry point. + + Args: + arg_list: Arguments to parse passing list to `parse_args()`. + """ + args = parse_args(arg_list) + init_logging_with_args(args) + + merge_genes_metadata = merge_genes(args.input_gff_file) + sorted_genes = sort_genes(merge_genes_metadata) + sorted_genes.to_csv(args.output_dir / "merged_genes.tsv", sep="\t", index=False) + + +if __name__ == "__main__": + main() From fc96508a114fcdb8c784c1d80d85c6927296e621 Mon Sep 17 00:00:00 2001 From: shradhasaraf Date: Fri, 11 Jul 2025 09:57:13 +0100 Subject: [PATCH 3/3] removed script from parent folder --- merge_genes.py | 234 ------------------------------------------------- 1 file changed, 234 deletions(-) delete mode 100644 merge_genes.py diff --git a/merge_genes.py b/merge_genes.py deleted file mode 100644 index 896436463..000000000 --- a/merge_genes.py +++ /dev/null @@ -1,234 +0,0 @@ -#!/usr/bin/env python -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Merge Genes script. - -This script aims to merge the genes from Nipponbare 3 annotation sources (RAPDB, MSU and Gramene). - -Below are some of the criteria to consider: -1. (plus) and (minus) strand genes are considered separately -2. The genes are considered as overlapping if the overlap length of the sequence is >=50% of the length of the smallest line. - -""" -import argparse -import logging -from pathlib import Path - -import pandas - -from ensembl.utils import StrPath -from ensembl.utils.argparse import ArgumentParser -from ensembl.utils.logging import init_logging_with_args - - -def update_gene_info( - update_gene: dict, - id_list: list, - merged_chromosome: str, - merged_gene_start: str, - merged_gene_end: str, - merged_strand: str, -) -> dict: - """ - Takes the id_list, other parameters from main, update the gene information and returns to the main - - Args: - update_gene (dict): column names as keys and corresponding parameters are values in teh form of list - id_list (list): A list of ids from genes to merge - merged_chromosome: Chromosome number from the merged gene - merged_gene_start: New gene start coordinate after merging the gene - merged_gene_end: New gene end coordinate after merging the gene - merged_strand: Strand from the merged genes - - Returns: - The updated gene. - """ - update_gene["chr"].append(merged_chromosome) - update_gene["source"].append("panoryza") - update_gene["gene"].append("gene") - update_gene["start"].append(merged_gene_start) - update_gene["end"].append(merged_gene_end) - update_gene["score"].append(".") - update_gene["strand"].append(merged_strand) - update_gene["frame"].append(".") - merge_ids = ",".join(id_list) - update_gene["attribute"].append(merge_ids) - return update_gene - - -def sort_genes(output_file: dict) -> pandas.DataFrame: - """Sort genes. - - Args: - output_file: gene information in a dictionary - Returns: - A sorted gene matrix. - - """ - gene_matrix = pandas.DataFrame.from_dict(output_file) - gene_matrix["chr_no"] = gene_matrix["chr"].str.extract("(\d+)", expand=False).astype(int) - gene_matrix.sort_values(by=["chr_no", "start"], inplace=True) - gene_matrix.drop("chr_no", axis=1, inplace=True) - return gene_matrix - - -def merge_genes(input_gff_file: StrPath) -> dict: - """Merge genes from the input GFF file. - - Args: - input_gff_file: Path to the GFF file. - - Returns: - The merged genes in a dictionary. - """ - - list_id = [] - dict_merged_gene = { - "chr": [], - "source": [], - "gene": [], - "start": [], - "end": [], - "score": [], - "strand": [], - "frame": [], - "attribute": [], - } - - logging.info("The input file is " + str(input_gff_file)) - - with open(input_gff_file) as gff_file: - lines = gff_file.readlines() - ## if line is starting with 'Chr' then split it and assign to different variables - for line in lines: - if line.startswith("#"): - continue - chromo, _, _, start, end, _, strand, _, description = line.split("\t") - gene_id = description.split(";Name=")[0].split(";biotype=")[0] - start = int(start) - end = int(end) - - ### Assign the values for the first gene in the gff - if line == lines[0]: - gene_start = start - gene_end = end - current_chromosome = str(line.split("\t")[0]) - current_strand = str(line.split("\t")[6]) - - ## Checking if the strand and chromosome are same between the previous and current line; otherwise print and start with the new gene - if strand != current_strand or chromo != current_chromosome: - dict_merged_gene = update_gene_info( - dict_merged_gene, list_id, current_chromosome, gene_start, gene_end, current_strand - ) - current_strand = strand - gene_start = start - gene_end = end - current_chromosome = chromo - list_id = [] - - ## if start of the next gene is lesser than the current gene, raise exception - if start < gene_start: - raise Exception("Error: File is not sorted properly") - - ## if start coordinate of a gene is equal or greater than the previous AND if start coordinate is lesser than the gene end then start merging the genes - elif (start == gene_start or start > gene_start) and start < gene_end: - ## if end coord of current gene is lesser than the previous, then merge the gene into the previous one - if end <= gene_end: - list_id.append(gene_id) - - ## if end coord of current gene is greater than the previous, then calculate the length of both genes and check overlap criteria - elif end > gene_end: - gene_length = gene_end - gene_start - length = end - start - min_length = min(length, gene_length) - overlap_length = gene_end - start - - ## Checking the overlap criteria - if int(overlap_length) * 2 >= int(min_length): - gene_end = end - list_id.append(gene_id) - logging.info("overlap\n") - else: - dict_merged_gene = update_gene_info( - dict_merged_gene, - list_id, - current_chromosome, - gene_start, - gene_end, - current_strand, - ) - list_id = [] - gene_start = start - gene_end = end - list_id.append(gene_id) - - ## if start coord of a gene is greater than the end coord of the previous gene then print the previous gene (current gene will not merge into that) - if start >= gene_end: - dict_merged_gene = update_gene_info( - dict_merged_gene, list_id, current_chromosome, gene_start, gene_end, current_strand - ) - list_id = [] - gene_start = start - gene_end = end - list_id.append(gene_id) - - ## If last line in the file, print the gene(s) - if line == lines[-1]: - dict_merged_gene = update_gene_info( - dict_merged_gene, list_id, current_chromosome, gene_start, gene_end, current_strand - ) - return dict_merged_gene - - -def parse_args(arg_list: list[str] | None) -> argparse.Namespace: - """Parse the aruments from the command line. - - Args: - arg_list: List of arguments to parse. If `None`, grab them from the command line. - - Returns: - Parsed arguments. - - """ - parser = ArgumentParser(description="Merge genes script") - parser.add_argument_src_path( - "--input_gff_file", - default="sorted_all_line.gff", - help="Input file with gene info from RAPDB, MSU and Gramene", - ) - parser.add_argument_dst_path( - "--output_dir", default=".", help="Output directory for the merged genes file" - ) - parser.add_log_arguments() - return parser.parse_args(arg_list) - - -def main(arg_list: list[str] | None = None) -> None: - """Main script entry point. - - Args: - arg_list: Arguments to parse passing list to `parse_args()`. - """ - args = parse_args(arg_list) - init_logging_with_args(args) - - merge_genes_metadata = merge_genes(args.input_gff_file) - sorted_genes = sort_genes(merge_genes_metadata) - sorted_genes.to_csv(args.output_dir / "merged_genes.tsv", sep="\t", index=False) - - -if __name__ == "__main__": - main()