Coverage for src/python/ensembl/io/genomio/seq_region/dump.py: 93%
124 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Fetch all the sequence regions from a core database and print them in JSON format."""
17__all__ = [
18 "fetch_coord_systems",
19 "get_seq_regions",
20 "add_attribs",
21 "get_synonyms",
22 "get_karyotype",
23]
25import json
26import logging
27from pathlib import Path
28from typing import Any, Iterator
30from sqlalchemy import select
31from sqlalchemy.orm import Session, joinedload
33from ensembl.core.models import CoordSystem, SeqRegion, SeqRegionSynonym, SeqRegionAttrib
34import ensembl.io.genomio
35from ensembl.io.genomio.database import DBConnectionLite
36from ensembl.io.genomio.external_db.db_map import get_external_db_map, DEFAULT_EXTERNAL_DB_MAP
37from ensembl.utils.argparse import ArgumentParser
38from ensembl.utils.logging import init_logging_with_args
41_KARYOTYPE_STRUCTURE = {"TEL": "telomere", "ACEN": "centromere"}
44def fetch_coord_systems(session: Session) -> Iterator[CoordSystem]:
45 """Retrieve the coord_system metadata from the current core.
47 Args:
48 session: Session for the current core database.
50 Yields:
51 All default coord_systems in the core database.
52 """
53 coord_system_select = select(CoordSystem).filter(CoordSystem.attrib.like(r"%default_version%"))
54 for row in session.execute(coord_system_select).unique().all():
55 coord: CoordSystem = row[0]
56 yield coord
59def fetch_seq_regions(session: Session, coord_system: CoordSystem) -> Iterator[SeqRegion]:
60 """Retrieve the seq_region metadata for a given coord_system, with accessory tables cached.
62 Args:
63 session: Session for the current core database.
64 coord_system: Coord_system to get seq regions.
66 Yields:
67 All seq_regions for the coord_system.
68 """
69 seq_region_select = (
70 select(SeqRegion)
71 .where(SeqRegion.coord_system_id == coord_system.coord_system_id)
72 .options(
73 joinedload(SeqRegion.seq_region_synonym).joinedload(SeqRegionSynonym.external_db),
74 joinedload(SeqRegion.seq_region_attrib).joinedload(SeqRegionAttrib.attrib_type),
75 joinedload(SeqRegion.karyotype),
76 )
77 )
78 for row in session.execute(seq_region_select).unique().all():
79 seq_region: SeqRegion = row[0]
80 yield seq_region
83def get_attribs_dict(seq_region: SeqRegion) -> dict[str, Any]:
84 """Returns a dict of attrib code-value for all the attributes of the given sequence region."""
85 return {attrib.attrib_type.code: attrib.value for attrib in seq_region.seq_region_attrib}
88def add_attribs(seq_region: dict, seq_region_attrib: dict) -> None:
89 """Map seq_regions attribs to a specific name and type defined below.
91 Args:
92 seq_region: A seq_region dict to modify.
93 seq_region_attrib: The attribs for this seq_region.
94 """
95 bool_attribs = {
96 "circular_seq": "circular",
97 "non_ref": "non_ref",
98 }
99 int_attribs = {
100 "codon_table": "codon_table",
101 }
102 string_attribs = {
103 "BRC4_seq_region_name": "BRC4_seq_region_name",
104 "EBI_seq_region_name": "EBI_seq_region_name",
105 "coord_system_tag": "coord_system_level",
106 "sequence_location": "location",
107 }
109 for name, key in bool_attribs.items():
110 # Make sure "0" means False, i.e. not added
111 value = int(seq_region_attrib.get(name, "0"))
112 if value:
113 seq_region[key] = bool(value)
115 for name, key in int_attribs.items():
116 value = seq_region_attrib.get(name, "")
117 if value:
118 seq_region[key] = int(value)
120 for name, key in string_attribs.items():
121 value = seq_region_attrib.get(name, "")
122 if value:
123 seq_region[key] = str(value)
126def get_synonyms(seq_region: SeqRegion, external_db_map: dict[str, str]) -> list[dict[str, str]]:
127 """Get all synonyms for a given seq_region. Use the mapping for synonym source names.
129 Args:
130 seq_region: Seq_region from which the synonyms are extracted.
131 external_db_map: To map the synonym source names.
133 Returns:
134 List of all synonyms as a dict with 'name' and 'source' keys.
135 """
136 synonyms = seq_region.seq_region_synonym
137 syns = []
138 if synonyms:
139 for syn in synonyms:
140 if syn.external_db:
141 source = syn.external_db.db_name
142 if source in external_db_map:
143 source = external_db_map[source]
144 syn_obj = {"name": syn.synonym, "source": source}
145 else:
146 syn_obj = {"name": syn.synonym}
147 syns.append(syn_obj)
149 syns = sorted(syns, key=lambda syn: (syn["name"], syn.get("source", "")))
150 return syns
153def get_karyotype(seq_region: SeqRegion) -> list[dict[str, str]]:
154 """Given a seq_region, extract the karyotype bands.
156 Args:
157 seq_region: The seq_region from which the karyotype bands are extracted.
159 Returns:
160 List of all karyotype bands as a dict with values 'start', 'end', 'name' 'stain', 'structure'.
161 """
162 bands = seq_region.karyotype
163 kars = []
164 if bands:
165 for band in bands:
166 kar = {"start": band.seq_region_start, "end": band.seq_region_end}
167 if band.band:
168 kar["name"] = band.band
169 if band.stain:
170 kar["stain"] = band.stain
171 structure = _KARYOTYPE_STRUCTURE.get(band.stain, "")
172 if structure:
173 kar["structure"] = structure
174 kars.append(kar)
176 kars = sorted(kars, key=lambda kar: kar.get("name", ""))
177 return kars
180def get_added_sequence(seq_region: SeqRegion) -> dict[str, str | dict[str, str]]:
181 """Extracts added sequence information of the given sequence region.
183 Args:
184 seq_region: Sequence region.
186 Returns:
187 Accession as well as assembly and annotation provider information of the added sequence.
188 """
189 attribs = get_attribs_dict(seq_region)
190 accession = attribs.get("added_seq_accession")
191 if accession is None:
192 return {}
194 added_sequence = {
195 "accession": accession,
196 }
198 # Assembly provider
199 assembly_provider = {
200 "name": attribs.get("added_seq_asm_pr_nam"),
201 "url": attribs.get("added_seq_asm_pr_url"),
202 }
203 if assembly_provider["name"] and assembly_provider["url"]:
204 added_sequence["assembly_provider"] = assembly_provider
206 # annotation provider
207 annotation_provider = {
208 "name": attribs.get("added_seq_ann_pr_nam"),
209 "url": attribs.get("added_seq_ann_pr_url"),
210 }
211 if annotation_provider["name"] and annotation_provider["url"]:
212 added_sequence["annotation_provider"] = annotation_provider
214 return added_sequence
217def get_seq_regions(session: Session, external_db_map: dict) -> list[SeqRegion]:
218 """Returns all the sequence regions from the current core database.
220 Include synonyms, attribs and karyotypes. Only the top level sequences are exported.
222 Args:
223 session: Session from the current core database.
224 external_db_map: Mapping of external_db names for the synonyms.
226 """
227 seq_regions = []
229 for coord_system in fetch_coord_systems(session):
230 logging.debug(f"Dump coord {coord_system.name}")
231 for seqr in fetch_seq_regions(session, coord_system):
232 seq_region: dict[str, Any] = {}
233 seq_region = {"name": seqr.name, "length": seqr.length}
234 synonyms = get_synonyms(seqr, external_db_map)
235 if synonyms:
236 seq_region["synonyms"] = synonyms
238 attribs = get_attribs_dict(seqr)
239 if not attribs or "toplevel" not in attribs:
240 # Skip seq_region without attribs or not toplevel
241 continue
242 add_attribs(seq_region, attribs)
244 karyotype = get_karyotype(seqr)
245 if karyotype:
246 seq_region["karyotype_bands"] = karyotype
248 added_seq = get_added_sequence(seqr)
249 if added_seq:
250 seq_region["added_sequence"] = added_seq
252 if "coord_system_level" not in seq_region:
253 seq_region["coord_system_level"] = coord_system.name
255 seq_regions.append(seq_region)
257 seq_regions = sorted(seq_regions, key=lambda seqr: (seqr["coord_system_level"], seqr["name"]))
258 return seq_regions
261def main() -> None:
262 """Main script entry-point."""
263 parser = ArgumentParser(
264 description="Fetch all the sequence regions from a core database and print them in JSON format."
265 )
266 parser.add_server_arguments(include_database=True)
267 parser.add_argument_src_path(
268 "--external_db_map", default=DEFAULT_EXTERNAL_DB_MAP.resolve(), help="File with external_db mapping"
269 )
270 parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
271 parser.add_log_arguments(add_log_file=True)
272 args = parser.parse_args()
273 init_logging_with_args(args)
275 dbc = DBConnectionLite(args.url)
277 external_map_path = Path(args.external_db_map)
278 external_map = get_external_db_map(external_map_path)
280 with dbc.session_scope() as session:
281 seq_regions = get_seq_regions(session, external_map)
283 print(json.dumps(seq_regions, indent=2, sort_keys=True))