Coverage for src/python/ensembl/io/genomio/seq_region/prepare.py: 68%
33 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Construct a seq_region metadata file from INSDC files."""
17from pathlib import Path
19import ensembl.io.genomio
20from ensembl.io.genomio.utils import get_json, print_json
21from ensembl.io.genomio.seq_region.collection import SeqCollection
22from ensembl.utils.argparse import ArgumentParser
23from ensembl.utils.logging import init_logging_with_args
24from ensembl.utils import StrPath
def prepare_seq_region_metadata(
    genome_file: StrPath,
    report_file: StrPath,
    dst_file: StrPath,
    *,
    gbff_file: StrPath | None = None,
    to_exclude: list[str] | None = None,
    mock_run: bool = False,
) -> None:
    """Builds the sequence region metadata JSON from an INSDC/RefSeq report and optional GBFF file.

    The sequence regions are loaded from the report file, complemented with the GBFF file when one
    is provided, filtered by name if requested, annotated with translation and mitochondrial codon
    tables, and finally written as a JSON file that follows the schema defined in
    "src/python/ensembl/io/genomio/data/schemas/seq_region.json".

    Args:
        genome_file: Genome metadata JSON file path. Its "assembly.accession" and
            "species.taxonomy_id" values are read.
        report_file: INSDC/RefSeq sequences report file path to parse.
        dst_file: Path of the JSON file to write the processed sequence regions to.
        gbff_file: INSDC/RefSeq GBFF file path to parse; skipped when not provided.
        to_exclude: Sequence region names to exclude; nothing is removed when empty or not provided.
        mock_run: Passed to the sequence collection as its `mock` flag (do not call external
            taxonomy service).

    """
    genome_meta = get_json(genome_file)
    output_path = Path(dst_file)

    # An assembly accession prefixed with "GCF_" marks the assembly as RefSeq
    accession = genome_meta["assembly"]["accession"]
    from_refseq = accession.startswith("GCF_")

    # Load the sequence regions: the report first, then the GBFF file if there is one
    collection = SeqCollection(mock=mock_run)
    collection.from_report(Path(report_file), from_refseq)
    if gbff_file:
        collection.from_gbff(Path(gbff_file))

    # Drop the sequence regions that were explicitly listed for exclusion
    if to_exclude:
        collection.remove(to_exclude)

    # Annotate with the translation table and the mitochondrial codon table
    collection.add_translation_table()
    taxonomy_id = genome_meta["species"]["taxonomy_id"]
    collection.add_mitochondrial_codon_table(taxonomy_id)

    # Dump the resulting sequence regions to the output JSON file
    print_json(output_path, collection.to_list())
def main() -> None:
    """Command-line entry point: parses the arguments and prepares the seq_region metadata file."""
    cli = ArgumentParser(description="Construct a sequence region metadata file from INSDC files.")

    # Input files
    cli.add_argument_src_path("--genome_file", required=True, help="Genome metadata JSON file")
    cli.add_argument_src_path(
        "--report_file", required=True, help="INSDC/RefSeq sequences report file to parse"
    )
    cli.add_argument_src_path("--gbff_file", help="INSDC/RefSeq GBFF file to parse")

    # Output file
    cli.add_argument_dst_path(
        "--dst_file", default="seq_region.json", help="Output JSON file for the processed sequence regions"
    )

    # Processing options
    cli.add_argument(
        "--to_exclude", nargs="*", metavar="SEQ_REGION_NAME", help="Sequence region names to exclude"
    )
    cli.add_argument("--mock_run", action="store_true", help="Do not call external APIs")

    # Version and logging options
    cli.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    cli.add_log_arguments()

    options = cli.parse_args()
    init_logging_with_args(options)

    prepare_seq_region_metadata(
        options.genome_file,
        options.report_file,
        options.dst_file,
        gbff_file=options.gbff_file,
        to_exclude=options.to_exclude,
        mock_run=options.mock_run,
    )