Coverage for src/python/ensembl/io/genomio/genome_metadata/prepare.py: 87%
75 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Expand the genome metadata file adding information about the provider, taxonomy, and assembly and
16gene build versions.
17"""
# Names exported by `from <this module> import *`.
__all__ = [
    "add_provider",
    "add_assembly_version",
    "add_genebuild_metadata",
    "add_species_metadata",
    "prepare_genome_metadata",
    "PROVIDER_DATA",
    "MissingNodeError",
    "MetadataError",
]
30import datetime
31from os import PathLike
32from typing import Dict
34import ensembl.io.genomio
35from ensembl.io.genomio.utils import get_json, print_json
36from ensembl.utils.argparse import ArgumentParser
37from ensembl.utils.logging import init_logging_with_args
# Provider lookup table, keyed first by provider ("GenBank" / "RefSeq") and then by metadata section
# ("assembly" / "annotation"). Every entry shares the same base URL; only the provider name differs.
PROVIDER_DATA = {
    provider: {
        section: {
            "provider_name": provider,
            "provider_url": "https://www.ncbi.nlm.nih.gov/datasets/genome",
        }
        for section in ("assembly", "annotation")
    }
    for provider in ("GenBank", "RefSeq")
}
class MissingNodeError(Exception):
    """Raised when a taxon XML node cannot be found."""
class MetadataError(Exception):
    """Raised when a metadata value is not what was expected.

    For example, `add_provider` raises it for an assembly accession that starts with neither `GCA`
    nor `GCF`.
    """
def add_provider(genome_metadata: Dict, ncbi_data: Dict) -> None:
    """Fills in missing provider details for the assembly and, when applicable, the annotation.

    The provider is chosen from the assembly accession prefix: `GCF` selects RefSeq and `GCA` selects
    GenBank. For each section, the provider name and URL are only written when neither
    `"provider_name"` nor `"provider_url"` is already present in it. The annotation section is only
    considered (and created if absent) when `ncbi_data` contains an `"annotation_info"` key.

    Args:
        genome_metadata: Genome information of assembly, accession and annotation. Updated in place.
        ncbi_data: Report data from NCBI datasets.

    Raises:
        MetadataError: If accession's format in genome metadata does not match with a known provider.
    """
    accession = genome_metadata["assembly"]["accession"]

    # Pick the provider table from the accession prefix
    if accession.startswith("GCF"):
        source = "RefSeq"
    elif accession.startswith("GCA"):
        source = "GenBank"
    else:
        raise MetadataError(f"Accession does not look like an INSDC or RefSeq accession: {accession}")
    provider = PROVIDER_DATA[source]

    def _fill_if_unset(section: Dict, kind: str) -> None:
        # Leave the section untouched as soon as either provider key is already there
        if "provider_name" in section or "provider_url" in section:
            return
        defaults = provider[kind]
        section["provider_name"] = defaults["provider_name"]
        section["provider_url"] = f'{defaults["provider_url"]}/{accession}'

    _fill_if_unset(genome_metadata["assembly"], "assembly")

    # The annotation section is only relevant when the NCBI report describes an annotation
    if "annotation_info" in ncbi_data:
        _fill_if_unset(genome_metadata.setdefault("annotation", {}), "annotation")
def add_assembly_version(genome_data: Dict) -> None:
    """Derives the assembly version from its accession when no version has been set yet.

    The version is whatever follows the first `.` in the accession, converted to `int`. Nothing is
    added if the accession has no such suffix.

    Args:
        genome_data: Genome information of assembly, accession and annotation. Updated in place.
    """
    assembly = genome_data["assembly"]
    if "version" in assembly:
        return

    _, _, suffix = assembly["accession"].partition(".")
    if suffix:
        assembly["version"] = int(suffix)
def add_genebuild_metadata(genome_data: Dict) -> None:
    """Ensures the genebuild section has both a `"version"` and a `"start_date"`.

    The `"genebuild"` section is created if absent. Any of the two keys that is missing is set to the
    current date in ISO format; values already present are left as they are.

    Args:
        genome_data: Genome information of assembly, accession and annotation. Updated in place.
    """
    genebuild = genome_data.setdefault("genebuild", {})
    today = datetime.date.today().isoformat()
    for key in ("version", "start_date"):
        genebuild.setdefault(key, today)
def add_species_metadata(genome_metadata: Dict, ncbi_data: Dict) -> None:
    """Copies taxonomy ID, scientific name and strain from the NCBI dataset report, where available.

    The `"species"` section is created if absent, even when the report has no `"organism"` entry.
    Values already present in the species section are never overwritten.

    Args:
        genome_metadata: Genome information of assembly, accession and annotation. Updated in place.
        ncbi_data: Report data from NCBI datasets.

    """
    species = genome_metadata.setdefault("species", {})
    try:
        organism = ncbi_data["organism"]
    except KeyError:
        return

    # Report key -> species metadata key
    key_map = (
        ("tax_id", "taxonomy_id"),
        ("organism_name", "scientific_name"),
    )
    for report_key, species_key in key_map:
        if report_key in organism:
            species.setdefault(species_key, organism[report_key])

    # The strain is optional at both nesting levels of the report
    try:
        strain = organism["infraspecific_names"]["strain"]
    except KeyError:
        return
    species.setdefault("strain", strain)
def prepare_genome_metadata(
    input_file: PathLike,
    output_file: PathLike,
    ncbi_meta: PathLike,
) -> None:
    """Loads the genome metadata, fills in whatever is missing and writes the result out.

    The provider, assembly version, genebuild and species sections are amended, in that order, using
    the first entry of the `"reports"` list found in the NCBI datasets file.

    Args:
        input_file: Path to JSON file with genome metadata.
        output_file: Path handed to `print_json` as the destination of the updated genome metadata.
        ncbi_meta: JSON file from NCBI datasets. If falsy, an empty report is used instead.

    """
    metadata = get_json(input_file)
    report = get_json(ncbi_meta)["reports"][0] if ncbi_meta else {}

    # Amend any missing metadata
    add_provider(metadata, report)
    add_assembly_version(metadata)
    add_genebuild_metadata(metadata)
    add_species_metadata(metadata, report)

    # Dump updated genome metadata
    print_json(output_file, metadata)
def main() -> None:
    """Command-line entry point: parses the arguments, sets up logging and runs the preparation."""
    cli = ArgumentParser(description=__doc__)
    cli.add_argument_src_path(
        "--input_file",
        required=True,
        help="Genome metadata JSON file",
    )
    cli.add_argument_dst_path(
        "--output_file",
        required=True,
        help="Output path for the new genome metadata file",
    )
    cli.add_argument_src_path(
        "--ncbi_meta",
        required=True,
        help="JSON file from NCBI datasets for this genome.",
    )
    cli.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    cli.add_log_arguments()

    options = cli.parse_args()
    init_logging_with_args(options)

    prepare_genome_metadata(options.input_file, options.output_file, options.ncbi_meta)