Coverage for src/python/ensembl/io/genomio/database/factory.py: 100%
63 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Generates one JSON file per metadata type inside `manifest`, including the manifest itself."""
17__all__ = ["format_db_data", "get_core_dbs_metadata"]
19import argparse
20import json
21import logging
22from pathlib import Path
24from sqlalchemy.engine import URL
26import ensembl.io.genomio
27from ensembl.utils.argparse import ArgumentParser
28from ensembl.utils.logging import init_logging_with_args
29from .core_server import CoreServer
30from .dbconnection_lite import DBConnectionLite
def format_db_data(server_url: URL, dbs: list[str], brc_mode: bool = False) -> list[dict]:
    """Returns a metadata list from the given databases on a server.

    Args:
        server_url: Server URL where all the databases are hosted.
        dbs: List of database names.
        brc_mode: If true, assign ``BRC4.organism_abbrev`` as the species, and ``BRC4.component`` as the
            division, each one only when that meta value is not `None`. Otherwise (or as a fallback),
            the species will be ``species.production_name`` and the division will be
            ``species.division``.

    Returns:
        List of dictionaries (one per database, in the same order as `dbs`) with 6 keys: "server",
        "production_name", "species", "division", "accession" and "release". The "server" value is
        itself a dictionary with the keys "host", "user", "port", "password" and "database".
    """
    databases_data = []
    for db_name in dbs:
        # Lazy %-style arguments: the message is only formatted if the DEBUG level is enabled
        logging.debug("Get metadata for %s", db_name)
        db_url = server_url.set(database=db_name)
        core_db = DBConnectionLite(db_url)

        prod_name = core_db.get_meta_value("species.production_name")
        species = prod_name
        division = core_db.get_meta_value("species.division")
        accession = core_db.get_meta_value("assembly.accession")
        project_release = core_db.get_project_release()

        if brc_mode:
            # Only override the defaults with the BRC4 meta values that are actually available
            brc_organism = core_db.get_meta_value("BRC4.organism_abbrev")
            brc_component = core_db.get_meta_value("BRC4.component")
            if brc_organism is not None:
                species = brc_organism
            if brc_component is not None:
                division = brc_component

        # A missing or empty division is reported under the catch-all value "all"
        if not division:
            division = "all"

        # NOTE: the password is copied verbatim from the URL, so the returned data may hold credentials
        server_data = {
            "host": db_url.host,
            "user": db_url.username,
            "port": db_url.port,
            "password": db_url.password,
            "database": db_url.database,
        }
        db_data = {
            "server": server_data,
            "production_name": prod_name,
            "species": species,
            "division": division,
            "accession": accession,
            "release": project_release,
        }

        databases_data.append(db_data)
    return databases_data
def get_core_dbs_metadata(
    server_url: URL,
    *,
    prefix: str = "",
    build: int | None = None,
    version: int | None = None,
    db_regex: str = "",
    db_list: Path | None = None,
    brc_mode: bool = False,
) -> list[dict]:
    """Returns all the metadata fetched for the selected core databases.

    Args:
        server_url: Server URL where the core databases are stored.
        prefix: Filter by prefix (no "_" is added automatically).
        build: Filter by VEuPathDB build number.
        version: Filter by Ensembl version.
        db_regex: Filter by dbname regular expression.
        db_list: Path to a file with one database name per line. Each line is stripped of surrounding
            whitespace before being used as a filter.
        brc_mode: Enable BRC mode.

    Returns:
        List of dictionaries (one per selected database) with 6 keys: "server", "production_name",
        "species", "division", "accession" and "release".
    """
    db_list_file = None
    if db_list:
        with db_list.open("r") as infile_fh:
            db_list_file = [line.strip() for line in infile_fh]
    # Get all database names
    server = CoreServer(server_url)
    logging.debug("Fetching databases...")
    databases = server.get_cores(
        prefix=prefix, build=build, version=version, dbname_re=db_regex, db_list=db_list_file
    )
    # Lazy %-style arguments: the message is only formatted if the INFO level is enabled
    logging.info("Got %s databases", len(databases))
    logging.debug("\n".join(databases))
    return format_db_data(server_url, databases, brc_mode)
def parse_args(arg_list: list[str] | None) -> argparse.Namespace:
    """Build the command line parser and return the namespace populated from the given arguments.

    Args:
        arg_list: Arguments to parse. When `None`, they are taken from the command line instead.

    """
    cli = ArgumentParser(description=__doc__)
    cli.add_server_arguments()

    # Options that narrow down which databases are selected
    cli.add_argument(
        "--prefix",
        default="",
        help="Prefix to filter the databases",
    )
    cli.add_argument(
        "--build",
        type=int,
        default=None,
        help="Build to filter the databases",
    )
    cli.add_argument(
        "--release",
        type=int,
        default=None,
        help="EnsEMBL release to filter the databases",
    )
    cli.add_argument(
        "--db_regex",
        default="",
        help="Regular expression to match database names against",
    )
    cli.add_argument_src_path(
        "--db_list",
        help="File with one database per line to load",
    )

    # Boolean switches and informational options
    cli.add_argument(
        "--brc_mode",
        action="store_true",
        help="Enable BRC mode, i.e. use organism_abbrev for species, component for division",
    )
    cli.add_argument(
        "--version",
        action="version",
        version=ensembl.io.genomio.__version__,
    )
    cli.add_log_arguments()

    parsed = cli.parse_args(arg_list)
    return parsed
def main(arg_list: list[str] | None = None) -> None:
    """Script entry-point: fetch the core databases metadata and print it as JSON.

    Args:
        arg_list: Arguments handed over to parse_args() for parsing.

    """
    options = parse_args(arg_list)
    init_logging_with_args(options)

    # Map the parsed command line options onto the filters of the metadata query
    metadata = get_core_dbs_metadata(
        server_url=options.url,
        prefix=options.prefix,
        build=options.build,
        version=options.release,
        db_regex=options.db_regex,
        db_list=options.db_list,
        brc_mode=options.brc_mode,
    )

    # Sorted keys and a fixed indentation keep the printed output stable between runs
    report = json.dumps(metadata, sort_keys=True, indent=4)
    print(report)