Coverage for src/python/ensembl/io/genomio/seq_region/prepare.py: 68%

33 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Construct a seq_region metadata file from INSDC files.""" 

16 

17from pathlib import Path 

18 

19import ensembl.io.genomio 

20from ensembl.io.genomio.utils import get_json, print_json 

21from ensembl.io.genomio.seq_region.collection import SeqCollection 

22from ensembl.utils.argparse import ArgumentParser 

23from ensembl.utils.logging import init_logging_with_args 

24from ensembl.utils import StrPath 

25 

26 

def prepare_seq_region_metadata(
    genome_file: StrPath,
    report_file: StrPath,
    dst_file: StrPath,
    *,
    gbff_file: StrPath | None = None,
    to_exclude: list[str] | None = None,
    mock_run: bool = False,
) -> None:
    """Builds the sequence region metadata JSON from an INSDC/RefSeq report and an optional GBFF file.

    Sequence regions are first loaded from the report file, then complemented with the GBFF file when
    one is provided. The requested sequence regions are removed, the translation and mitochondrial
    codon tables are added, and the resulting list is written to `dst_file`. The output is expected to
    follow the schema defined in "src/python/ensembl/io/genomio/data/schemas/seq_region.json".

    The assembly is treated as RefSeq when its accession (`assembly.accession` in the genome
    metadata) starts with "GCF_".

    Args:
        genome_file: Genome metadata JSON file path.
        report_file: INSDC/RefSeq sequences report file path to parse.
        dst_file: JSON file output for the processed sequence regions JSON.
        gbff_file: INSDC/RefSeq GBFF file path to parse.
        to_exclude: Sequence region names to exclude.
        mock_run: Do not call external taxonomy service (forwarded to `SeqCollection` as `mock`).

    """
    genome_meta = get_json(genome_file)
    output_path = Path(dst_file)
    accession = genome_meta["assembly"]["accession"]
    from_refseq = accession.startswith("GCF_")

    # Load the sequence regions from the report, then from the GBFF file if there is one
    collection = SeqCollection(mock=mock_run)
    collection.from_report(Path(report_file), from_refseq)
    if gbff_file:
        collection.from_gbff(Path(gbff_file))

    # Drop the sequence regions the caller asked to exclude
    if to_exclude:
        collection.remove(to_exclude)

    # Codon tables: translation table first, then the mitochondrial one based on the species taxonomy
    collection.add_translation_table()
    taxonomy_id = genome_meta["species"]["taxonomy_id"]
    collection.add_mitochondrial_codon_table(taxonomy_id)

    # Dump the final list of sequence regions
    print_json(output_path, collection.to_list())

70 

71 

def main() -> None:
    """Command-line entry point.

    Parses the command-line arguments, initialises logging from them and calls
    `prepare_seq_region_metadata()` with the parsed values.
    """
    cli = ArgumentParser(description="Construct a sequence region metadata file from INSDC files.")

    # Input files
    cli.add_argument_src_path("--genome_file", required=True, help="Genome metadata JSON file")
    cli.add_argument_src_path(
        "--report_file",
        required=True,
        help="INSDC/RefSeq sequences report file to parse",
    )
    cli.add_argument_src_path("--gbff_file", help="INSDC/RefSeq GBFF file to parse")

    # Output file
    cli.add_argument_dst_path(
        "--dst_file",
        default="seq_region.json",
        help="Output JSON file for the processed sequence regions",
    )

    # Processing options
    cli.add_argument(
        "--to_exclude",
        nargs="*",
        metavar="SEQ_REGION_NAME",
        help="Sequence region names to exclude",
    )
    cli.add_argument("--mock_run", action="store_true", help="Do not call external APIs")

    # Version and logging options
    cli.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    cli.add_log_arguments()

    options = cli.parse_args()
    init_logging_with_args(options)

    prepare_seq_region_metadata(
        genome_file=options.genome_file,
        report_file=options.report_file,
        dst_file=options.dst_file,
        gbff_file=options.gbff_file,
        to_exclude=options.to_exclude,
        mock_run=options.mock_run,
    )