Coverage for src/python/ensembl/io/genomio/seq_region/dump.py: 93%

124 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Fetch all the sequence regions from a core database and print them in JSON format.""" 

16 

17__all__ = [ 

18 "fetch_coord_systems", 

19 "get_seq_regions", 

20 "add_attribs", 

21 "get_synonyms", 

22 "get_karyotype", 

23] 

24 

25import json 

26import logging 

27from pathlib import Path 

28from typing import Any, Iterator 

29 

30from sqlalchemy import select 

31from sqlalchemy.orm import Session, joinedload 

32 

33from ensembl.core.models import CoordSystem, SeqRegion, SeqRegionSynonym, SeqRegionAttrib 

34import ensembl.io.genomio 

35from ensembl.io.genomio.database import DBConnectionLite 

36from ensembl.io.genomio.external_db.db_map import get_external_db_map, DEFAULT_EXTERNAL_DB_MAP 

37from ensembl.utils.argparse import ArgumentParser 

38from ensembl.utils.logging import init_logging_with_args 

39 

40 

41_KARYOTYPE_STRUCTURE = {"TEL": "telomere", "ACEN": "centromere"} 

42 

43 

44def fetch_coord_systems(session: Session) -> Iterator[CoordSystem]: 

45 """Retrieve the coord_system metadata from the current core. 

46 

47 Args: 

48 session: Session for the current core database. 

49 

50 Yields: 

51 All default coord_systems in the core database. 

52 """ 

53 coord_system_select = select(CoordSystem).filter(CoordSystem.attrib.like(r"%default_version%")) 

54 for row in session.execute(coord_system_select).unique().all(): 

55 coord: CoordSystem = row[0] 

56 yield coord 

57 

58 

59def fetch_seq_regions(session: Session, coord_system: CoordSystem) -> Iterator[SeqRegion]: 

60 """Retrieve the seq_region metadata for a given coord_system, with accessory tables cached. 

61 

62 Args: 

63 session: Session for the current core database. 

64 coord_system: Coord_system to get seq regions. 

65 

66 Yields: 

67 All seq_regions for the coord_system. 

68 """ 

69 seq_region_select = ( 

70 select(SeqRegion) 

71 .where(SeqRegion.coord_system_id == coord_system.coord_system_id) 

72 .options( 

73 joinedload(SeqRegion.seq_region_synonym).joinedload(SeqRegionSynonym.external_db), 

74 joinedload(SeqRegion.seq_region_attrib).joinedload(SeqRegionAttrib.attrib_type), 

75 joinedload(SeqRegion.karyotype), 

76 ) 

77 ) 

78 for row in session.execute(seq_region_select).unique().all(): 

79 seq_region: SeqRegion = row[0] 

80 yield seq_region 

81 

82 

83def get_attribs_dict(seq_region: SeqRegion) -> dict[str, Any]: 

84 """Returns a dict of attrib code-value for all the attributes of the given sequence region.""" 

85 return {attrib.attrib_type.code: attrib.value for attrib in seq_region.seq_region_attrib} 

86 

87 

88def add_attribs(seq_region: dict, seq_region_attrib: dict) -> None: 

89 """Map seq_regions attribs to a specific name and type defined below. 

90 

91 Args: 

92 seq_region: A seq_region dict to modify. 

93 seq_region_attrib: The attribs for this seq_region. 

94 """ 

95 bool_attribs = { 

96 "circular_seq": "circular", 

97 "non_ref": "non_ref", 

98 } 

99 int_attribs = { 

100 "codon_table": "codon_table", 

101 } 

102 string_attribs = { 

103 "BRC4_seq_region_name": "BRC4_seq_region_name", 

104 "EBI_seq_region_name": "EBI_seq_region_name", 

105 "coord_system_tag": "coord_system_level", 

106 "sequence_location": "location", 

107 } 

108 

109 for name, key in bool_attribs.items(): 

110 # Make sure "0" means False, i.e. not added 

111 value = int(seq_region_attrib.get(name, "0")) 

112 if value: 

113 seq_region[key] = bool(value) 

114 

115 for name, key in int_attribs.items(): 

116 value = seq_region_attrib.get(name, "") 

117 if value: 

118 seq_region[key] = int(value) 

119 

120 for name, key in string_attribs.items(): 

121 value = seq_region_attrib.get(name, "") 

122 if value: 

123 seq_region[key] = str(value) 

124 

125 

126def get_synonyms(seq_region: SeqRegion, external_db_map: dict[str, str]) -> list[dict[str, str]]: 

127 """Get all synonyms for a given seq_region. Use the mapping for synonym source names. 

128 

129 Args: 

130 seq_region: Seq_region from which the synonyms are extracted. 

131 external_db_map: To map the synonym source names. 

132 

133 Returns: 

134 List of all synonyms as a dict with 'name' and 'source' keys. 

135 """ 

136 synonyms = seq_region.seq_region_synonym 

137 syns = [] 

138 if synonyms: 

139 for syn in synonyms: 

140 if syn.external_db: 

141 source = syn.external_db.db_name 

142 if source in external_db_map: 

143 source = external_db_map[source] 

144 syn_obj = {"name": syn.synonym, "source": source} 

145 else: 

146 syn_obj = {"name": syn.synonym} 

147 syns.append(syn_obj) 

148 

149 syns = sorted(syns, key=lambda syn: (syn["name"], syn.get("source", ""))) 

150 return syns 

151 

152 

153def get_karyotype(seq_region: SeqRegion) -> list[dict[str, str]]: 

154 """Given a seq_region, extract the karyotype bands. 

155 

156 Args: 

157 seq_region: The seq_region from which the karyotype bands are extracted. 

158 

159 Returns: 

160 List of all karyotype bands as a dict with values 'start', 'end', 'name' 'stain', 'structure'. 

161 """ 

162 bands = seq_region.karyotype 

163 kars = [] 

164 if bands: 

165 for band in bands: 

166 kar = {"start": band.seq_region_start, "end": band.seq_region_end} 

167 if band.band: 

168 kar["name"] = band.band 

169 if band.stain: 

170 kar["stain"] = band.stain 

171 structure = _KARYOTYPE_STRUCTURE.get(band.stain, "") 

172 if structure: 

173 kar["structure"] = structure 

174 kars.append(kar) 

175 

176 kars = sorted(kars, key=lambda kar: kar.get("name", "")) 

177 return kars 

178 

179 

180def get_added_sequence(seq_region: SeqRegion) -> dict[str, str | dict[str, str]]: 

181 """Extracts added sequence information of the given sequence region. 

182 

183 Args: 

184 seq_region: Sequence region. 

185 

186 Returns: 

187 Accession as well as assembly and annotation provider information of the added sequence. 

188 """ 

189 attribs = get_attribs_dict(seq_region) 

190 accession = attribs.get("added_seq_accession") 

191 if accession is None: 

192 return {} 

193 

194 added_sequence = { 

195 "accession": accession, 

196 } 

197 

198 # Assembly provider 

199 assembly_provider = { 

200 "name": attribs.get("added_seq_asm_pr_nam"), 

201 "url": attribs.get("added_seq_asm_pr_url"), 

202 } 

203 if assembly_provider["name"] and assembly_provider["url"]: 

204 added_sequence["assembly_provider"] = assembly_provider 

205 

206 # annotation provider 

207 annotation_provider = { 

208 "name": attribs.get("added_seq_ann_pr_nam"), 

209 "url": attribs.get("added_seq_ann_pr_url"), 

210 } 

211 if annotation_provider["name"] and annotation_provider["url"]: 

212 added_sequence["annotation_provider"] = annotation_provider 

213 

214 return added_sequence 

215 

216 

217def get_seq_regions(session: Session, external_db_map: dict) -> list[SeqRegion]: 

218 """Returns all the sequence regions from the current core database. 

219 

220 Include synonyms, attribs and karyotypes. Only the top level sequences are exported. 

221 

222 Args: 

223 session: Session from the current core database. 

224 external_db_map: Mapping of external_db names for the synonyms. 

225 

226 """ 

227 seq_regions = [] 

228 

229 for coord_system in fetch_coord_systems(session): 

230 logging.debug(f"Dump coord {coord_system.name}") 

231 for seqr in fetch_seq_regions(session, coord_system): 

232 seq_region: dict[str, Any] = {} 

233 seq_region = {"name": seqr.name, "length": seqr.length} 

234 synonyms = get_synonyms(seqr, external_db_map) 

235 if synonyms: 

236 seq_region["synonyms"] = synonyms 

237 

238 attribs = get_attribs_dict(seqr) 

239 if not attribs or "toplevel" not in attribs: 

240 # Skip seq_region without attribs or not toplevel 

241 continue 

242 add_attribs(seq_region, attribs) 

243 

244 karyotype = get_karyotype(seqr) 

245 if karyotype: 

246 seq_region["karyotype_bands"] = karyotype 

247 

248 added_seq = get_added_sequence(seqr) 

249 if added_seq: 

250 seq_region["added_sequence"] = added_seq 

251 

252 if "coord_system_level" not in seq_region: 

253 seq_region["coord_system_level"] = coord_system.name 

254 

255 seq_regions.append(seq_region) 

256 

257 seq_regions = sorted(seq_regions, key=lambda seqr: (seqr["coord_system_level"], seqr["name"])) 

258 return seq_regions 

259 

260 

261def main() -> None: 

262 """Main script entry-point.""" 

263 parser = ArgumentParser( 

264 description="Fetch all the sequence regions from a core database and print them in JSON format." 

265 ) 

266 parser.add_server_arguments(include_database=True) 

267 parser.add_argument_src_path( 

268 "--external_db_map", default=DEFAULT_EXTERNAL_DB_MAP.resolve(), help="File with external_db mapping" 

269 ) 

270 parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__) 

271 parser.add_log_arguments(add_log_file=True) 

272 args = parser.parse_args() 

273 init_logging_with_args(args) 

274 

275 dbc = DBConnectionLite(args.url) 

276 

277 external_map_path = Path(args.external_db_map) 

278 external_map = get_external_db_map(external_map_path) 

279 

280 with dbc.session_scope() as session: 

281 seq_regions = get_seq_regions(session, external_map) 

282 

283 print(json.dumps(seq_regions, indent=2, sort_keys=True))