Coverage for src/python/ensembl/io/genomio/genome_stats/dump.py: 90%

71 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Generates a JSON representation of the genome stats (assembly and annotation) from a core database.""" 

16 

17__all__ = ["StatsGenerator"] 

18 

19from dataclasses import dataclass 

20import json 

21from typing import Any, Dict 

22 

23from sqlalchemy import select, func 

24from sqlalchemy.orm import Session 

25 

26from ensembl.core.models import SeqRegionAttrib, AttribType, Gene, Transcript 

27import ensembl.io.genomio 

28from ensembl.io.genomio.database import DBConnectionLite 

29from ensembl.utils.argparse import ArgumentParser 

30from ensembl.utils.database import StrURL 

31from ensembl.utils.logging import init_logging_with_args 

32 

33 

@dataclass
class StatsGenerator:
    """Interface to extract genome stats from a core database.

    Attributes:
        session: Session used to execute every query issued by this class.

    """

    session: Session

    def get_assembly_stats(self) -> Dict[str, Any]:
        """Returns a dict of stats about the assembly.

        The dict has three keys (`coord_system`, `locations` and `codon_table`), each holding the
        value counts of the matching attrib_type code.

        """
        stats = {
            "coord_system": self.get_attrib_counts("coord_system_tag"),
            "locations": self.get_attrib_counts("sequence_location"),
            "codon_table": self.get_attrib_counts("codon_table"),
        }
        # Special: rename supercontigs to scaffolds for homogeneity
        StatsGenerator._fix_scaffolds(stats)
        return stats

    @staticmethod
    def _fix_scaffolds(stats: Dict[str, Any]) -> None:
        """Renames supercontigs to scaffolds in the provided stats (modified in place).

        If scaffolds are present already, or there is no supercontig, nothing is done.

        Args:
            stats: Statistics dictionary.

        """
        coords = stats.get("coord_system", {})
        if "supercontig" in coords and "scaffold" not in coords:
            # Move the count under the new key in a single step
            coords["scaffold"] = coords.pop("supercontig")

    def get_attrib_counts(self, code: str) -> Dict[str, Any]:
        """Returns a dict of count for each value counted with the attrib_type code provided.

        Args:
            code: Ensembl database attrib_type code.

        """
        seqs_st = (
            select(SeqRegionAttrib.value, func.count())  # pylint: disable=not-callable
            .join(AttribType)
            .filter(AttribType.code == code)
            .group_by(SeqRegionAttrib.value)
        )
        # Each row is an (attribute value, count) pair
        return {attribute_name: count for (attribute_name, count) in self.session.execute(seqs_st)}

    def get_annotation_stats(self) -> Dict[str, Any]:
        """Returns a dict of stats about the annotation (genes and transcripts).

        The dict has two keys, `genes` and `transcripts`, each holding the feature stats returned
        by `get_feature_stats()` for the corresponding table.

        """
        stats = {
            "genes": self.get_feature_stats(Gene),
            "transcripts": self.get_feature_stats(Transcript),
        }
        return stats

    def get_biotypes(self, table: Any) -> Dict[str, int]:
        """Returns a dict with the number of features for each biotype.

        Args:
            table: Mapped feature class exposing a `biotype` column (e.g. `Gene` or `Transcript`).

        """
        # pylint: disable-next=not-callable
        seqs_st = select(table.biotype, func.count()).group_by(table.biotype)
        # Each row is a (biotype, count) pair
        return {biotype: count for (biotype, count) in self.session.execute(seqs_st)}

    def get_feature_stats(self, table: Any) -> Dict[str, Any]:
        """Returns a dict of stats about a given feature.

        The dict contains the `total` number of features, the count per biotype (`biotypes`) and
        a `description` breakdown: `empty` (NULL description), `source_xref` (description
        containing "[Source:") and `normal` (all the remaining ones).

        Args:
            table: Mapped feature class exposing `biotype` and `description` columns (e.g. `Gene`
                or `Transcript`).

        """
        session = self.session
        totals_st = select(func.count()).select_from(table)  # pylint: disable=not-callable
        (total,) = session.execute(totals_st).one()
        # pylint: disable-next=not-callable
        no_desc_st = select(func.count()).filter(table.description.is_(None))
        (no_desc,) = session.execute(no_desc_st).one()
        # pylint: disable-next=not-callable
        xref_desc_st = select(func.count()).where(table.description.like("%[Source:%"))
        (xref_desc,) = session.execute(xref_desc_st).one()
        # Whatever is neither NULL nor xref-sourced is considered a regular description
        left_over = total - no_desc - xref_desc
        feat_stats = {
            "total": total,
            "biotypes": self.get_biotypes(table),
            "description": {
                "empty": no_desc,
                "source_xref": xref_desc,
                "normal": left_over,
            },
        }
        return feat_stats

    def get_genome_stats(self) -> Dict[str, Any]:
        """Returns a dict of stats about the assembly and annotation.

        The dict has two keys: `assembly_stats` and `annotation_stats`.

        """
        genome_stats = {
            "assembly_stats": self.get_assembly_stats(),
            "annotation_stats": self.get_annotation_stats(),
        }
        return genome_stats

133 

134 

def dump_genome_stats(url: StrURL) -> Dict[str, Any]:
    """Returns a dict with the genome stats (assembly and annotation) of the given core database.

    Args:
        url: Core database URL.

    """
    connection = DBConnectionLite(url)
    # All the queries are run within a single session scope
    with connection.session_scope() as db_session:
        stats = StatsGenerator(db_session).get_genome_stats()
    return stats

147 

148 

def main() -> None:
    """Main script entry-point.

    Parses the command line arguments, sets up the logging and prints the genome stats of the
    requested core database to the standard output as indented JSON with sorted keys.

    """
    arg_parser = ArgumentParser(description=__doc__)
    arg_parser.add_server_arguments(include_database=True)
    arg_parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    arg_parser.add_log_arguments(add_log_file=True)
    cli_args = arg_parser.parse_args()
    init_logging_with_args(cli_args)

    stats_json = json.dumps(dump_genome_stats(cli_args.url), indent=2, sort_keys=True)
    print(stats_json)