Coverage for src/python/ensembl/io/genomio/genome_stats/dump.py: 90%
71 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Generates a JSON representation of the genome stats (assembly and annotation) from a core database."""
17__all__ = ["StatsGenerator"]
19from dataclasses import dataclass
20import json
21from typing import Any, Dict
23from sqlalchemy import select, func
24from sqlalchemy.orm import Session
26from ensembl.core.models import SeqRegionAttrib, AttribType, Gene, Transcript
27import ensembl.io.genomio
28from ensembl.io.genomio.database import DBConnectionLite
29from ensembl.utils.argparse import ArgumentParser
30from ensembl.utils.database import StrURL
31from ensembl.utils.logging import init_logging_with_args
@dataclass
class StatsGenerator:
    """Interface to extract genome stats from a core database.

    Attributes:
        session: SQLAlchemy session used to run every query.

    """

    session: Session

    def get_assembly_stats(self) -> Dict[str, Any]:
        """Returns a dict of stats about the assembly.

        The dict has the keys `coord_system`, `locations` and `codon_table`, each one holding the counts
        returned by `get_attrib_counts()` for the matching attrib_type code.

        """
        stats = {
            "coord_system": self.get_attrib_counts("coord_system_tag"),
            "locations": self.get_attrib_counts("sequence_location"),
            "codon_table": self.get_attrib_counts("codon_table"),
        }
        # Special: rename supercontigs to scaffolds for homogeneity
        StatsGenerator._fix_scaffolds(stats)
        return stats

    @staticmethod
    def _fix_scaffolds(stats: Dict[str, Any]) -> None:
        """Renames supercontigs to scaffolds in the provided stats (in place).

        If scaffolds are present already, or there are no supercontigs, nothing is done.

        Args:
            stats: Statistics dictionary; only its `coord_system` entry (if any) is modified.

        """
        coords = stats.get("coord_system", {})
        if "supercontig" in coords and "scaffold" not in coords:
            coords["scaffold"] = coords.pop("supercontig")

    def _fetch_counts(self, statement: Any) -> Dict[Any, int]:
        """Runs a two-column (value, count) statement and returns its rows as a `{value: count}` dict.

        Args:
            statement: Select statement whose rows unpack into exactly two items.

        """
        return {value: count for (value, count) in self.session.execute(statement)}

    def get_attrib_counts(self, code: str) -> Dict[str, Any]:
        """Returns a dict of count for each value counted with the attrib_type code provided.

        Args:
            code: Ensembl database attrib_type code.

        """
        seqs_st = (
            select(SeqRegionAttrib.value, func.count())  # pylint: disable=not-callable
            .join(AttribType)
            .where(AttribType.code == code)
            .group_by(SeqRegionAttrib.value)
        )
        return self._fetch_counts(seqs_st)

    def get_annotation_stats(self) -> Dict[str, Any]:
        """Returns a dict of stats about the annotation, i.e. the gene and transcript feature stats."""
        stats = {
            "genes": self.get_feature_stats(Gene),
            "transcripts": self.get_feature_stats(Transcript),
        }
        return stats

    def get_biotypes(self, table: Any) -> Dict[str, int]:
        """Returns a dict with the number of features for each biotype.

        Args:
            table: Feature model to count; it must expose a `biotype` column.

        """
        # pylint: disable-next=not-callable
        seqs_st = select(table.biotype, func.count()).group_by(table.biotype)
        return self._fetch_counts(seqs_st)

    def get_feature_stats(self, table: Any) -> Dict[str, Any]:
        """Returns a dict of stats about a given feature.

        The dict holds the `total` number of features, the counts per biotype (`biotypes`) and a
        `description` breakdown: `empty` (description is NULL), `source_xref` (description contains
        "[Source:") and `normal` (every other feature).

        Args:
            table: Feature model to summarise; it must expose `biotype` and `description` columns.

        """
        session = self.session
        # Every count below is taken over the same table, so share the base statement
        count_st = select(func.count()).select_from(table)  # pylint: disable=not-callable
        (total,) = session.execute(count_st).one()
        no_desc_st = count_st.where(table.description.is_(None))
        (no_desc,) = session.execute(no_desc_st).one()
        xref_desc_st = count_st.where(table.description.like("%[Source:%"))
        (xref_desc,) = session.execute(xref_desc_st).one()
        # Whatever is neither NULL nor imported from an xref source is considered a normal description
        left_over = total - no_desc - xref_desc
        feat_stats = {
            "total": total,
            "biotypes": self.get_biotypes(table),
            "description": {
                "empty": no_desc,
                "source_xref": xref_desc,
                "normal": left_over,
            },
        }
        return feat_stats

    def get_genome_stats(self) -> Dict[str, Any]:
        """Returns a dict of stats about the assembly (`assembly_stats`) and annotation (`annotation_stats`)."""
        genome_stats = {
            "assembly_stats": self.get_assembly_stats(),
            "annotation_stats": self.get_annotation_stats(),
        }
        return genome_stats
def dump_genome_stats(url: StrURL) -> Dict[str, Any]:
    """Collects the genome stats (assembly and annotation) of the given core database.

    Args:
        url: Core database URL.

    Returns:
        The dict produced by `StatsGenerator.get_genome_stats()`, ready to be serialised as JSON.

    """
    connection = DBConnectionLite(url)
    # All the queries run while the session scope is open
    with connection.session_scope() as db_session:
        return StatsGenerator(db_session).get_genome_stats()
def main() -> None:
    """Script entry-point: parses the command line and prints the genome stats as JSON."""
    arg_parser = ArgumentParser(description=__doc__)
    arg_parser.add_server_arguments(include_database=True)
    arg_parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    arg_parser.add_log_arguments(add_log_file=True)
    cli_args = arg_parser.parse_args()
    init_logging_with_args(cli_args)

    # Keys are sorted so the printed output is stable from one run to the next
    stats_json = json.dumps(dump_genome_stats(cli_args.url), indent=2, sort_keys=True)
    print(stats_json)