Coverage for src/python/ensembl/io/genomio/database/factory.py: 100%

63 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Generates one JSON file per metadata type inside `manifest`, including the manifest itself.""" 

16 

17__all__ = ["format_db_data", "get_core_dbs_metadata"] 

18 

19import argparse 

20import json 

21import logging 

22from pathlib import Path 

23 

24from sqlalchemy.engine import URL 

25 

26import ensembl.io.genomio 

27from ensembl.utils.argparse import ArgumentParser 

28from ensembl.utils.logging import init_logging_with_args 

29from .core_server import CoreServer 

30from .dbconnection_lite import DBConnectionLite 

31 

32 

def format_db_data(server_url: URL, dbs: list[str], brc_mode: bool = False) -> list[dict]:
    """Collects the metadata of every listed database hosted on one server.

    For each database name a connection is opened on ``server_url`` with that database selected, and
    the meta values and project release are read from it.

    Args:
        server_url: URL of the server hosting the databases (without a specific database selected).
        dbs: Names of the databases to describe.
        brc_mode: If true, ``BRC4.organism_abbrev`` replaces the species and ``BRC4.component``
            replaces the division, each only when the corresponding meta value is not ``None``.
            Otherwise the species is ``species.production_name`` and the division is
            ``species.division``.

    Returns:
        One dictionary per database, in the same order as `dbs`, with the keys "server",
        "production_name", "species", "division", "accession" and "release". The "server" value is
        itself a dictionary with "host", "user", "port", "password" and "database". An empty or
        missing division is reported as "all".
    """
    metadata: list[dict] = []
    for name in dbs:
        logging.debug(f"Get metadata for {name}")
        url = server_url.set(database=name)
        connection = DBConnectionLite(url)

        # Values every core database is asked for, BRC mode or not
        production_name = connection.get_meta_value("species.production_name")
        division_name = connection.get_meta_value("species.division")
        assembly_accession = connection.get_meta_value("assembly.accession")
        release = connection.get_project_release()

        species_name = production_name
        if brc_mode:
            organism_abbrev = connection.get_meta_value("BRC4.organism_abbrev")
            component = connection.get_meta_value("BRC4.component")
            # Only override with BRC values that are actually defined
            species_name = production_name if organism_abbrev is None else organism_abbrev
            division_name = division_name if component is None else component

        metadata.append(
            {
                "server": {
                    "host": url.host,
                    "user": url.username,
                    "port": url.port,
                    "password": url.password,
                    "database": url.database,
                },
                "production_name": production_name,
                "species": species_name,
                # Fall back to a catch-all division when none is defined
                "division": division_name or "all",
                "accession": assembly_accession,
                "release": release,
            }
        )
    return metadata

87 

88 

def get_core_dbs_metadata(
    server_url: URL,
    *,
    prefix: str = "",
    build: int | None = None,
    version: int | None = None,
    db_regex: str = "",
    db_list: Path | None = None,
    brc_mode: bool = False,
) -> list[dict]:
    """Fetches the metadata of the core databases selected on a server.

    The candidate databases are obtained from ``CoreServer.get_cores()`` using the given filters, and
    their metadata is then built by `format_db_data()`.

    Args:
        server_url: URL of the server where the core databases are stored.
        prefix: Filter by prefix (no "_" is added automatically).
        build: Filter by VEuPathDB build number.
        version: Filter by Ensembl version.
        db_regex: Filter by database name regular expression.
        db_list: Path to a file with one database name per line; each line is stripped of
            surrounding whitespace before being used as a filter.
        brc_mode: Enable BRC mode (forwarded to `format_db_data()`).

    Returns:
        The list of dictionaries produced by `format_db_data()`, one per selected database.
    """
    # Load the explicit database names first, if a file has been provided
    requested_names = None
    if db_list:
        with db_list.open("r") as names_fh:
            requested_names = list(map(str.strip, names_fh))

    core_server = CoreServer(server_url)
    logging.debug("Fetching databases...")
    filters = {
        "prefix": prefix,
        "build": build,
        "version": version,
        "dbname_re": db_regex,
        "db_list": requested_names,
    }
    selected_dbs = core_server.get_cores(**filters)
    logging.info(f"Got {len(selected_dbs)} databases")
    logging.debug("\n".join(selected_dbs))

    return format_db_data(server_url, selected_dbs, brc_mode)

126 

127 

def parse_args(arg_list: list[str] | None) -> argparse.Namespace:
    """Builds the command line parser and returns the namespace populated by it.

    Besides the server and logging arguments added by the parser helpers, the options registered here
    are ``--prefix``, ``--build``, ``--release``, ``--db_regex``, ``--db_list``, ``--brc_mode`` and
    ``--version``.

    Args:
        arg_list: List of arguments to parse. If `None`, grab them from the command line.

    Returns:
        Namespace with the parsed arguments.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_server_arguments()

    # Database filters, registered in the order they are shown in the help message
    filter_options = (
        ("--prefix", {"default": "", "help": "Prefix to filter the databases"}),
        ("--build", {"type": int, "default": None, "help": "Build to filter the databases"}),
        ("--release", {"type": int, "default": None, "help": "EnsEMBL release to filter the databases"}),
        ("--db_regex", {"default": "", "help": "Regular expression to match database names against"}),
    )
    for flag, options in filter_options:
        parser.add_argument(flag, **options)
    parser.add_argument_src_path("--db_list", help="File with one database per line to load")

    # Flags
    parser.add_argument(
        "--brc_mode",
        action="store_true",
        help="Enable BRC mode, i.e. use organism_abbrev for species, component for division",
    )
    parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    parser.add_log_arguments()

    parsed = parser.parse_args(arg_list)
    return parsed

152 

153 

def main(arg_list: list[str] | None = None) -> None:
    """Script entry-point: prints the metadata of the selected core databases as JSON.

    The arguments are parsed with `parse_args()`, logging is initialised from them, and the metadata
    returned by `get_core_dbs_metadata()` is written to the standard output as indented JSON with
    sorted keys.

    Args:
        arg_list: Arguments to parse, passed on to `parse_args()`.

    """
    options = parse_args(arg_list)
    init_logging_with_args(options)

    # Note that the command line "--release" option maps to the "version" filter
    metadata = get_core_dbs_metadata(
        options.url,
        prefix=options.prefix,
        build=options.build,
        version=options.release,
        db_regex=options.db_regex,
        db_list=options.db_list,
        brc_mode=options.brc_mode,
    )
    report = json.dumps(metadata, indent=4, sort_keys=True)
    print(report)

173 print(json.dumps(databases_data, sort_keys=True, indent=4))