Coverage for src/python/ensembl/io/genomio/genome_metadata/dump.py: 86%

132 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Generates a JSON file representing the genome metadata from a core database.""" 

16 

# Public API of this module. All three check helpers called by `filter_genome_meta` are exported.
__all__ = [
    "get_genome_metadata",
    "filter_genome_meta",
    "check_assembly_refseq",
    "check_assembly_version",
    "check_genebuild_version",
    "metadata_dump_setup",
]

24 

25import argparse 

26import json 

27from typing import Any, Type 

28import logging 

29from pydoc import locate 

30 

31from sqlalchemy import select 

32from sqlalchemy.orm import Session 

33from sqlalchemy.engine import URL 

34 

35from ensembl.core.models import Meta 

36import ensembl.io.genomio 

37from ensembl.io.genomio.utils.json_utils import get_json 

38from ensembl.io.genomio.database import DBConnectionLite 

39from ensembl.utils.argparse import ArgumentParser 

40from ensembl.utils import StrPath 

41from ensembl.utils.logging import init_logging_with_args 

42 

43 

# Filter applied when the user does not provide one: for each main meta key, the subkeys to keep
# and the Python type each of their values is converted to.
DEFAULT_FILTER: dict[str, dict[str, Type]] = {
    "database": {
        "name": str,
    },
    "added_seq": {
        "region_name": str,
    },
    "annotation": {
        "provider_name": str,
        "provider_url": str,
    },
    "assembly": {
        "accession": str,
        "date": str,
        "name": str,
        "provider_name": str,
        "provider_url": str,
        "version": int,
    },
    "BRC4": {
        "organism_abbrev": str,
        "component": str,
    },
    "genebuild": {
        "id": str,
        "method": str,
        "method_display": str,
        "start_date": str,
        "version": str,
    },
    "species": {
        "alias": str,
        "annotation_source": str,
        "display_name": str,
        "division": str,
        "production_name": str,
        "scientific_name": str,
        "strain": str,
        "taxonomy_id": int,
    },
}

69 

70 

def get_genome_metadata(session: Session, db_name: str | None) -> dict[str, Any]:
    """Returns the meta table content from the core database in a nested dictionary.

    Each `meta_key` is split at its first "." into a main key and a subkey, so the result has the
    form `{main_key: {subkey: value}}`. A value stays a list when there are several rows for the
    same key, and a main key that only appears without subkey maps directly to its value.

    Args:
        session: Session for the current core.
        db_name: Target database name. If provided, it is stored in `["database"]["name"]`.

    Raises:
        ValueError: If a main key appears both with and without a subkey.
    """
    genome_metadata: dict[str, Any] = {}

    for row in session.execute(select(Meta)).unique().all():
        meta = row[0]
        # partition() yields an empty subkey when there is no ".", so every entry can be nested
        # in the same way
        main_key, _, subkey = meta.meta_key.partition(".")
        genome_metadata.setdefault(main_key, {}).setdefault(subkey, []).append(meta.meta_value)

    if db_name:
        genome_metadata["database"] = {"name": f"{db_name}"}

    # Simplify the dictionary and check data consistency. Only values of existing keys are
    # replaced, so updating the dictionaries while iterating over them is safe.
    for main_key, subkeys_dict in genome_metadata.items():
        # Unwrap the lists that hold a single value
        for subkey, value in subkeys_dict.items():
            if len(value) == 1:
                subkeys_dict[subkey] = value[0]
        if "" not in subkeys_dict:
            continue
        # The empty subkey cannot coexist with real subkeys
        if len(subkeys_dict) != 1:
            raise ValueError(f"Unexpected meta keys for '{main_key}': {', '.join(subkeys_dict.keys())}")
        # Only the empty subkey: hand its value over to the main key
        genome_metadata[main_key] = subkeys_dict.pop("")
    return genome_metadata

110 

111 

112def filter_genome_meta( 

113 genome_metadata: dict[str, Any], metafilter: dict | None, meta_update: bool 

114) -> dict[str, Any]: 

115 """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER. 

116 

117 Also converts to expected data types (to follow the genome JSON schema). 

118 

119 Args: 

120 genome_metadata: Nested metadata key values from the core metadata table. 

121 metafilter: Input JSON containing subset of meta table values to filter on. 

122 meta_update: Deactivates additional meta updating. 

123 

124 """ 

125 filtered_metadata: dict[str, Any] = {} 

126 

127 if metafilter: 

128 metadata_filter: dict[str, dict[str, type]] = metafilter 

129 else: 

130 metadata_filter = DEFAULT_FILTER 

131 

132 for key, subfilter in metadata_filter.items(): 

133 if key in genome_metadata: 

134 filtered_metadata[key] = {} 

135 for subkey, value_type in subfilter.items(): 

136 if isinstance(value_type, str): 

137 value_type = type(value_type) 

138 if isinstance(value_type, int): 138 ↛ 139line 138 didn't jump to line 139 because the condition on line 138 was never true

139 value_type = type(value_type) 

140 if subkey in genome_metadata[key]: 

141 value = genome_metadata[key][subkey] 

142 if isinstance(value, list): 

143 value = [value_type(x) for x in value] 

144 else: 

145 value = value_type(value) 

146 filtered_metadata[key][subkey] = value 

147 

148 # Optional assembly and genebuild based filtering: 

149 if meta_update: 

150 # Check assembly and genebuild versions 

151 check_assembly_refseq(filtered_metadata) 

152 check_assembly_version(filtered_metadata) 

153 check_genebuild_version(filtered_metadata) 

154 

155 return filtered_metadata 

156 

157 

def check_assembly_refseq(gmeta_out: dict[str, Any]) -> None:
    """Updates the GCA accession to use GCF if the assembly is from RefSeq.

    The metadata is updated in place. Nothing is changed if the assembly provider name is missing
    or is not "RefSeq", or if there is no assembly accession to update.

    Args:
        gmeta_out: Nested metadata key values from the core metadata table.
    """
    assembly = gmeta_out.get("assembly", {})
    provider_name = assembly.get("provider_name")
    if not provider_name:
        # Adjacent literals instead of a backslash continuation, which would embed the source
        # indentation in the message
        logging.debug(
            "Meta filter update to RefSeq accession not done: user meta filter missing: "
            "'assembly.provider_name'"
        )
        return
    if provider_name != "RefSeq":
        logging.info(f"Meta check 'assembly is RefSeq': Asm provider = {provider_name}")
        return

    accession = assembly.get("accession")
    if not accession:
        # The filter may keep the provider name but not the accession
        logging.warning("RefSeq accession update not done: 'assembly.accession' is missing.")
        return
    assembly["accession"] = accession.replace("GCA", "GCF")
    logging.info("GCA accession updated to RefSeq GCF accession.")

176 

177 

def check_assembly_version(genome_metadata: dict[str, Any]) -> None:
    """Updates the assembly version of the genome metadata provided.

    If `version` meta key is not an integer or it is not available, the assembly accession's version
    (the part after the first "." of the accession) will be used instead. The metadata is updated
    in place.

    Args:
        genome_metadata: Nested metadata key values from the core metadata table.

    Raises:
        KeyError: If there is no `assembly` entry in `genome_metadata`.
        ValueError: If both `version` and the assembly accession's version are not integers or are missing.
    """
    assembly = genome_metadata["assembly"]
    version = assembly.get("version")
    # Check the version is an integer
    try:
        assembly["version"] = int(version)
    except (ValueError, TypeError) as exc:
        # Get the version from the assembly accession; a missing accession is handled like an
        # accession without version, so it ends up in the documented ValueError
        accession = assembly.get("accession", "")
        version = accession.partition(".")[2]
        try:
            assembly["version"] = int(version)
        except ValueError:
            raise ValueError(f"Assembly version is not an integer in {assembly}") from exc
        logging.info(f"Assembly version [v{version}] obtained from assembly accession ({accession}).")
    else:
        logging.info(f'Located version [v{assembly["version"]}] info from meta data.')

206 

207 

def check_genebuild_version(genome_metadata: dict[str, Any]) -> None:
    """Updates the genebuild version (if not present) from the genebuild ID, removing the latter.

    The metadata is updated in place. Nothing is done if there is no `genebuild` entry.

    Args:
        genome_metadata: Nested metadata key values from the core metadata table.

    Raises:
        ValueError: If there is no genebuild version or ID available.
    """
    if "genebuild" not in genome_metadata:
        return
    genebuild = genome_metadata["genebuild"]
    if "version" not in genebuild:
        # Guard clause rather than raising from an `except KeyError` block, so the error carries
        # no unrelated KeyError context
        if "id" not in genebuild:
            raise ValueError("No genebuild version or ID found")
        genebuild["version"] = str(genebuild["id"])
    # Drop genebuild ID since there is a genebuild version
    genebuild.pop("id", None)

230 

231 

def convert_dict(meta_dict: dict) -> dict:
    """Returns a copy of the meta dictionary with its type names replaced by the types themselves.

    Nested dictionaries are converted recursively. Any other value is expected to be the name of a
    type (e.g. `"str"` or `"int"`) and is resolved with `pydoc.locate`. A name that does not
    resolve to a type is reported with a warning and its resolved value (`None` when nothing is
    found) is kept as is.

    Args:
        meta_dict: User meta dictionary with literal string typing to be converted.
    """
    new_dict = meta_dict.copy()
    for key, value in meta_dict.items():
        if isinstance(value, dict):
            new_dict[key] = convert_dict(value)
            continue
        # NOTE(review): `locate` resolves any importable dotted path, not only type names, so
        # filter files should come from a trusted source
        located = locate(value)
        if not isinstance(located, type):
            logging.warning(f"Meta filter value '{value}' for key '{key}' does not name a type: {located!r}")
        new_dict[key] = located
    return new_dict

245 

246 

def metadata_dump_setup(
    db_url: URL, input_filter: StrPath | None, meta_update: bool, append_db: bool
) -> dict[str, Any]:
    """Runs the main stages of the genome meta dump from the user input arguments provided.

    The meta table of the core database is loaded and then filtered, either with the filter read
    from `input_filter` or, without one, with the default filter.

    Args:
        db_url: Target core database URL.
        input_filter: Input JSON containing subset of meta table values to filter on.
        meta_update: Run the additional assembly and genebuild checks and updates on the filtered
            metadata.
        append_db: Append target core database name to output JSON.

    """
    dbc = DBConnectionLite(db_url)
    db_name = db_url.database if append_db else None
    # The JSON filter names its types as strings: turn them into actual types. An empty filter
    # makes the filtering step fall back to its default.
    meta_filter = convert_dict(get_json(input_filter)) if input_filter else {}

    with dbc.session_scope() as session:
        full_meta = get_genome_metadata(session, db_name)
        genome_meta = filter_genome_meta(full_meta, meta_filter, meta_update)

    return genome_meta

273 

274 

def parse_args(arg_list: list[str] | None) -> argparse.Namespace:
    """Returns the namespace populated with the arguments parsed from a list or the command line.

    Args:
        arg_list: List of arguments to parse. If `None`, grab them from the command line.

    """
    parser = ArgumentParser(description=__doc__)
    parser.add_server_arguments(include_database=True, help="server url and core database")
    parser.add_argument_src_path(
        "--metafilter",
        default=None,
        help="JSON file of nested meta_key:meta_value to filter dump output.",
    )
    parser.add_argument(
        "--meta_update",
        action="store_true",
        help="Perform assembly and genebuild 'version' metadata checks & update if needed.",
    )
    parser.add_argument(
        "--append_db",
        action="store_true",
        help="Append core database name to output JSON.",
    )
    parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    parser.add_log_arguments(add_log_file=True)
    parsed_args = parser.parse_args(arg_list)
    return parsed_args

296 

297 

def main(arg_list: list[str] | None = None) -> None:
    """Main script entry-point: dumps the genome metadata as JSON to the standard output.

    Args:
        arg_list: Arguments to parse passing list to parse_args().
    """
    args = parse_args(arg_list)
    init_logging_with_args(args)

    genome_meta = metadata_dump_setup(
        db_url=args.url,
        input_filter=args.metafilter,
        meta_update=args.meta_update,
        append_db=args.append_db,
    )

    # Sorted keys keep the output stable between runs
    json_output = json.dumps(genome_meta, indent=2, sort_keys=True)
    print(json_output)