Coverage for src/python/ensembl/io/genomio/gff3/process.py: 21%

35 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Simplify and fix a GFF3 file and return both a cleaned-up GFF3 file and a functional annotation 

16JSON file. 

17""" 

18 

19import logging 

20from pathlib import Path 

21 

22import ensembl.io.genomio 

23from ensembl.utils.argparse import ArgumentParser 

24from ensembl.utils.logging import init_logging_with_args 

25 

26from .simplifier import GFFSimplifier 

27from .gene_merger import GFFGeneMerger 

28 

29 

def main() -> None:
    """Command-line entry point: standardize a GFF3 file and extract its functional annotation.

    Parses the command-line arguments, runs the gene merger on the input GFF3 (its output goes to
    an interim path derived from the input path), then runs the simplifier on either the interim
    file (if any gene was merged) or the original input, and finally writes the resulting GFF3
    records and the functional annotation JSON to the requested output paths.
    """
    cli = ArgumentParser(
        description=(
            "Standardize the gene model representation of a GFF3 file, and extract the functional "
            "annotation in a separate file."
        )
    )
    # Inputs
    cli.add_argument_src_path("--in_gff_path", required=True, help="Input GFF3 file")
    cli.add_argument_src_path("--genome_data", required=True, help="Genome JSON file")
    cli.add_argument(
        "--fail_missing_stable_ids", action="store_true", help="Do not generate IDs when missing/invalid"
    )
    # Outputs
    cli.add_argument_dst_path("--out_gff_path", default=Path("gene_models.gff3"), help="Output GFF3 file")
    cli.add_argument_dst_path(
        "--out_func_path",
        default=Path("functional_annotation.json"),
        help="Output functional annotation JSON file",
    )
    cli.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    cli.add_log_arguments(add_log_file=True)
    options = cli.parse_args()
    init_logging_with_args(options)

    # First pass: merge gene features split over several lines, writing to an interim file
    logging.info("Checking for genes to merge...")
    merged_gff_path = Path(f"{options.in_gff_path}_INTERIM_MERGE")
    merged_genes = GFFGeneMerger().merge(options.in_gff_path, merged_gff_path)
    num_merged_genes = len(merged_genes)

    if num_merged_genes > 0:
        # Report what was merged so that it can be checked if something does not look right
        logging.info(f"{num_merged_genes} genes merged")
        logging.debug("\n".join(merged_genes))
        # Continue from the interim file that contains the merged genes
        gff_to_simplify = merged_gff_path
    else:
        # Nothing was merged: continue from the original input file
        gff_to_simplify = options.in_gff_path

    # Second pass: load the GFF3 data, then write a simpler version that follows our
    # specifications together with a functional annotation JSON file
    logging.info("Simplify and fix GFF3")
    simplifier = GFFSimplifier(options.genome_data)
    if options.fail_missing_stable_ids:
        # Requested by the user: do not generate stable IDs when they are missing/invalid
        simplifier.stable_ids.make_missing_stable_ids = False
    simplifier.simpler_gff3(gff_to_simplify)
    simplifier.records.to_gff(options.out_gff_path)
    simplifier.annotations.to_json(options.out_func_path)

78 

79 

# Only run the command-line entry point when this module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()