Coverage for src/python/ensembl/io/genomio/gff3/process.py: 21%
35 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Simplify and fix a GFF3 file, returning both a cleaned-up GFF3 file and a functional annotation
16JSON file.
17"""
19import logging
20from pathlib import Path
22import ensembl.io.genomio
23from ensembl.utils.argparse import ArgumentParser
24from ensembl.utils.logging import init_logging_with_args
26from .simplifier import GFFSimplifier
27from .gene_merger import GFFGeneMerger
def main() -> None:
    """Command-line entry point: merge split genes, simplify the GFF3 and export its annotation.

    The function parses the command-line options, initialises logging from them, and then:

    1. runs ``GFFGeneMerger.merge`` on the input GFF3, targeting an interim path named
       ``<in_gff_path>_INTERIM_MERGE``;
    2. feeds either the interim file (when at least one gene was reported as merged) or the
       original input to ``GFFSimplifier``;
    3. writes the resulting records to ``--out_gff_path`` and the annotations to
       ``--out_func_path``.
    """
    cli = ArgumentParser(
        description=(
            "Standardize the gene model representation of a GFF3 file, and extract the functional "
            "annotation in a separate file."
        )
    )
    # Inputs
    cli.add_argument_src_path("--in_gff_path", required=True, help="Input GFF3 file")
    cli.add_argument_src_path("--genome_data", required=True, help="Genome JSON file")
    cli.add_argument(
        "--fail_missing_stable_ids", action="store_true", help="Do not generate IDs when missing/invalid"
    )
    # Outputs
    cli.add_argument_dst_path("--out_gff_path", default=Path("gene_models.gff3"), help="Output GFF3 file")
    cli.add_argument_dst_path(
        "--out_func_path",
        default=Path("functional_annotation.json"),
        help="Output functional annotation JSON file",
    )
    # Housekeeping options
    cli.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    cli.add_log_arguments(add_log_file=True)
    args = cli.parse_args()
    init_logging_with_args(args)

    # Step 1: merge genes split over several lines; the merger targets a sibling interim path
    logging.info("Checking for genes to merge...")
    merged_gff_path = Path(f"{args.in_gff_path}_INTERIM_MERGE")
    merged_ids = GFFGeneMerger().merge(args.in_gff_path, merged_gff_path)
    num_merged_genes = len(merged_ids)
    any_merged = num_merged_genes > 0
    if any_merged:
        # List what was merged so that an unexpected merge can be tracked down from the logs
        logging.info(f"{num_merged_genes} genes merged")
        logging.debug("\n".join(merged_ids))
    # Only switch to the interim file when the merger actually reported merged genes
    gff_to_process = merged_gff_path if any_merged else args.in_gff_path

    # Step 2: simplify the GFF3 to our specifications and extract the functional annotation
    logging.info("Simplify and fix GFF3")
    simplifier = GFFSimplifier(args.genome_data)
    if args.fail_missing_stable_ids:
        # The flag only ever switches ID generation off; otherwise the simplifier default is kept
        simplifier.stable_ids.make_missing_stable_ids = False
    simplifier.simpler_gff3(gff_to_process)

    # Step 3: write both outputs
    simplifier.records.to_gff(args.out_gff_path)
    simplifier.annotations.to_json(args.out_func_path)
# Only invoke the command-line entry point when this module is run as the main program,
# so that importing the module does not trigger argument parsing or any file processing.
if __name__ == "__main__":
    main()