Coverage for src/python/ensembl/io/genomio/genome_metadata/dump.py: 86%
132 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Generates a JSON file representing the genome metadata from a core database."""
# Names exported by `from ... import *`
__all__ = [
    "get_genome_metadata",
    "filter_genome_meta",
    "check_assembly_version",
    "check_genebuild_version",
    "metadata_dump_setup",
]
25import argparse
26import json
27from typing import Any, Type
28import logging
29from pydoc import locate
31from sqlalchemy import select
32from sqlalchemy.orm import Session
33from sqlalchemy.engine import URL
35from ensembl.core.models import Meta
36import ensembl.io.genomio
37from ensembl.io.genomio.utils.json_utils import get_json
38from ensembl.io.genomio.database import DBConnectionLite
39from ensembl.utils.argparse import ArgumentParser
40from ensembl.utils import StrPath
41from ensembl.utils.logging import init_logging_with_args
# Filter applied when no user filter is given: main meta key -> subkey -> type the value is cast to
DEFAULT_FILTER: dict[str, dict[str, Type]] = {
    "database": {"name": str},
    "added_seq": {"region_name": str},
    "annotation": {
        "provider_name": str,
        "provider_url": str,
    },
    "assembly": {
        "accession": str,
        "date": str,
        "name": str,
        "provider_name": str,
        "provider_url": str,
        "version": int,
    },
    "BRC4": {
        "organism_abbrev": str,
        "component": str,
    },
    "genebuild": {
        "id": str,
        "method": str,
        "method_display": str,
        "start_date": str,
        "version": str,
    },
    "species": {
        "alias": str,
        "annotation_source": str,
        "display_name": str,
        "division": str,
        "production_name": str,
        "scientific_name": str,
        "strain": str,
        "taxonomy_id": int,
    },
}
def get_genome_metadata(session: Session, db_name: str | None) -> dict[str, Any]:
    """Returns the meta table content from the core database in a nested dictionary.

    Each meta key is split on its first "." into a main key and a subkey. Values sharing the same
    main key and subkey are gathered in a list; a single value is stored on its own. A main key that
    has no subkey at all maps directly to its value(s).

    Args:
        session: Session for the current core.
        db_name: Target database name. If provided, it is stored under `database.name`.

    Raises:
        ValueError: If a main key is found both with and without subkeys.

    """
    nested_meta: dict[str, Any] = {}

    for row in session.execute(select(Meta)).unique().all():
        entry = row[0]
        # The subkey is an empty string when the meta key has no ".", which keeps the grouping uniform
        main_key, _, subkey = entry.meta_key.partition(".")
        nested_meta.setdefault(main_key, {}).setdefault(subkey, []).append(entry.meta_value)

    if db_name:
        nested_meta["database"] = {"name": f"{db_name}"}

    # Simplify the dictionary and check data consistency
    for main_key, subkeys in nested_meta.items():
        # Unwrap the lists that hold a single value
        for subkey, values in subkeys.items():
            if len(values) == 1:
                subkeys[subkey] = values[0]
        if "" not in subkeys:
            continue
        # A main key without subkey cannot coexist with the same main key with subkeys
        if len(subkeys) != 1:
            raise ValueError(f"Unexpected meta keys for '{main_key}': {', '.join(subkeys.keys())}")
        # Only "" as subkey: hand its value over to the main key
        nested_meta[main_key] = subkeys.pop("")
    return nested_meta
112def filter_genome_meta(
113 genome_metadata: dict[str, Any], metafilter: dict | None, meta_update: bool
114) -> dict[str, Any]:
115 """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER.
117 Also converts to expected data types (to follow the genome JSON schema).
119 Args:
120 genome_metadata: Nested metadata key values from the core metadata table.
121 metafilter: Input JSON containing subset of meta table values to filter on.
122 meta_update: Deactivates additional meta updating.
124 """
125 filtered_metadata: dict[str, Any] = {}
127 if metafilter:
128 metadata_filter: dict[str, dict[str, type]] = metafilter
129 else:
130 metadata_filter = DEFAULT_FILTER
132 for key, subfilter in metadata_filter.items():
133 if key in genome_metadata:
134 filtered_metadata[key] = {}
135 for subkey, value_type in subfilter.items():
136 if isinstance(value_type, str):
137 value_type = type(value_type)
138 if isinstance(value_type, int): 138 ↛ 139line 138 didn't jump to line 139 because the condition on line 138 was never true
139 value_type = type(value_type)
140 if subkey in genome_metadata[key]:
141 value = genome_metadata[key][subkey]
142 if isinstance(value, list):
143 value = [value_type(x) for x in value]
144 else:
145 value = value_type(value)
146 filtered_metadata[key][subkey] = value
148 # Optional assembly and genebuild based filtering:
149 if meta_update:
150 # Check assembly and genebuild versions
151 check_assembly_refseq(filtered_metadata)
152 check_assembly_version(filtered_metadata)
153 check_genebuild_version(filtered_metadata)
155 return filtered_metadata
def check_assembly_refseq(gmeta_out: dict[str, Any]) -> None:
    """Updates the GCA accession to use GCF if the assembly provider is RefSeq.

    The assembly metadata is modified in place. Nothing is changed if the assembly provider name or
    the assembly accession are not available.

    Args:
        gmeta_out: Nested metadata key values to check and update.

    """
    assembly = gmeta_out.get("assembly", {})
    provider_name = assembly.get("provider_name")
    if not provider_name:
        logging.debug(
            "Meta filter update to RefSeq accession not done: user meta filter missing: "
            "'assembly.provider_name'"
        )
        return
    if provider_name != "RefSeq":
        logging.info(f"Meta check 'assembly is RefSeq': Asm provider = {provider_name}")
        return
    accession = assembly.get("accession")
    if not accession:
        # The accession may have been filtered out: there is nothing to update
        logging.debug("Meta filter update to RefSeq accession not done: missing 'assembly.accession'")
        return
    assembly["accession"] = accession.replace("GCA", "GCF")
    logging.info("GCA accession updated to RefSeq GCF accession.")
def check_assembly_version(genome_metadata: dict[str, Any]) -> None:
    """Updates the assembly version of the genome metadata provided.

    If `version` meta key is not an integer or it is not available, the assembly accession's version
    will be used instead. The assembly metadata is modified in place.

    Args:
        genome_metadata: Nested metadata key values from the core metadata table.

    Raises:
        KeyError: If there is no `assembly` key in the genome metadata.
        ValueError: If both `version` and the assembly accession's version are not integers or are missing.

    """
    assembly = genome_metadata["assembly"]
    # Check the version is an integer
    try:
        assembly["version"] = int(assembly.get("version"))
    except (ValueError, TypeError) as exc:
        # Get the version from the assembly accession, i.e. whatever follows its first "."
        # A missing accession yields an empty version, reported below as a ValueError like any
        # other non-integer version
        accession = assembly.get("accession") or ""
        accession_version = accession.partition(".")[2]
        try:
            assembly["version"] = int(accession_version)
        except ValueError:
            raise ValueError(f"Assembly version is not an integer in {assembly}") from exc
        logging.info(f"Assembly version [v{accession_version}] obtained from assembly accession ({accession}).")
    else:
        logging.info(f'Located version [v{assembly["version"]}] info from meta data.')
def check_genebuild_version(genome_metadata: dict[str, Any]) -> None:
    """Sets the genebuild version from the genebuild ID when there is no version, then drops the ID.

    The genebuild metadata is modified in place. Nothing is done if there is no `genebuild` key.

    Args:
        genome_metadata: Nested metadata key values from the core metadata table.

    Raises:
        ValueError: If there is no genebuild version or ID available.

    """
    if "genebuild" not in genome_metadata:
        return
    build_info = genome_metadata["genebuild"]
    if "version" not in build_info:
        try:
            build_id = build_info["id"]
        except KeyError:
            # pylint: disable=raise-missing-from
            raise ValueError("No genebuild version or ID found")
        build_info["version"] = str(build_id)
    # The ID is not needed any more now that there is a genebuild version
    build_info.pop("id", None)
def convert_dict(meta_dict: dict) -> dict:
    """Returns a copy of the dictionary where every type name has been replaced by the named object.

    Nested dictionaries are converted recursively. Any other value is resolved with `pydoc.locate()`,
    e.g. `"int"` becomes `int`; a name that cannot be resolved becomes `None`.

    Args:
        meta_dict: User meta dictionary with literal string typing to be converted.

    """
    return {
        key: convert_dict(value) if isinstance(value, dict) else locate(value)
        for key, value in meta_dict.items()
    }
def metadata_dump_setup(
    db_url: URL, input_filter: StrPath | None, meta_update: bool, append_db: bool
) -> dict[str, Any]:
    """Runs the main stages of the genome meta dump from the user input arguments provided.

    The meta content is fetched from the core database and then filtered, with the user filter if
    one is provided.

    Args:
        db_url: Target core database URL.
        input_filter: Input JSON containing subset of meta table values to filter on.
        meta_update: Passed on to `filter_genome_meta()` to enable its additional meta updating.
        append_db: Append target core database name to output JSON.

    """
    dbc = DBConnectionLite(db_url)
    db_name = db_url.database if append_db else None
    # The filter file holds type names as strings, which need to be turned into the actual types
    meta_filter = convert_dict(get_json(input_filter)) if input_filter else {}

    with dbc.session_scope() as session:
        raw_meta = get_genome_metadata(session, db_name)
        filtered_meta = filter_genome_meta(raw_meta, meta_filter, meta_update)

    return filtered_meta
def parse_args(arg_list: list[str] | None) -> argparse.Namespace:
    """Returns a populated namespace with the arguments parsed from a list or from the command line.

    Args:
        arg_list: List of arguments to parse. If `None`, grab them from the command line.

    """
    cli = ArgumentParser(description=__doc__)
    cli.add_server_arguments(include_database=True, help="server url and core database")
    cli.add_argument_src_path(
        "--metafilter",
        default=None,
        help="JSON file of nested meta_key:meta_value to filter dump output.",
    )
    cli.add_argument(
        "--meta_update",
        action="store_true",
        help="Perform assembly and genebuild 'version' metadata checks & update if needed.",
    )
    cli.add_argument(
        "--append_db",
        action="store_true",
        help="Append core database name to output JSON.",
    )
    cli.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    cli.add_log_arguments(add_log_file=True)
    namespace = cli.parse_args(arg_list)
    return namespace
def main(arg_list: list[str] | None = None) -> None:
    """Main script entry-point: dumps the genome metadata as JSON to the standard output.

    Args:
        arg_list: Arguments to parse passing list to parse_args().

    """
    args = parse_args(arg_list)
    init_logging_with_args(args)

    genome_meta = metadata_dump_setup(
        db_url=args.url,
        input_filter=args.metafilter,
        meta_update=args.meta_update,
        append_db=args.append_db,
    )

    json_output = json.dumps(genome_meta, indent=2, sort_keys=True)
    print(json_output)