Coverage for src/python/ensembl/io/genomio/genome_stats/compare.py: 91%

78 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Tool set to compare genome statistic between NCBI datasets and Ensembl's core databases.""" 

16 

17__all__ = ["stats_dict_cmp", "compare_assembly", "compare_annotation", "compare_stats", "compare_stats_files"] 

18 

19import json 

20from os import PathLike 

21import re 

22from typing import Any, Dict 

23 

24import ensembl.io.genomio 

25from ensembl.io.genomio.utils import get_json 

26from ensembl.utils.argparse import ArgumentParser 

27from ensembl.utils.logging import init_logging_with_args 

28 

29 

30def stats_dict_cmp(ncbi: Dict[str, int], core: Dict[str, int]) -> Dict[str, Dict]: 

31 """Compares both dictionaries and returns the similar and different elements between both. 

32 

33 The method assumes both dictionaries have the same set of keys. A key would be considered the 

34 same if its value in both dictionaries is the same, but will only be included in the returned 

35 dictionary if that value is different than 0. 

36 

37 Args: 

38 ncbi: NCBI dataset statistics in key-value pairs. 

39 core: Core database statistics in key-value pairs. 

40 

41 Returns: 

42 A dictionary with 2 keys: 

43 - "same": Pairs of key - value for those entries equal in both dictionaries. 

44 - "different": Keys that differ, with values for "ncbi", "core", and "diff", i.e. their 

45 difference represented as `core_value - ncbi_value`. 

46 

47 """ 

48 diff = {} 

49 same = {} 

50 for key, ncbi_count in ncbi.items(): 

51 core_count = core[key] 

52 if ncbi_count == core_count: 

53 if ncbi_count != 0: 

54 same[key] = ncbi_count 

55 else: 

56 diff[key] = {"ncbi": ncbi_count, "core": core_count, "diff": core_count - ncbi_count} 

57 comparison: Dict[str, Dict] = {} 

58 if same: 

59 comparison["same"] = same 

60 if diff: 

61 comparison["different"] = diff 

62 return comparison 

63 

64 

65def compare_assembly(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]: 

66 """Extracts the assembly statistics and returns the comparison between both sources. 

67 

68 The assembly statistics compared are the number of: organella, chromosomes, scaffolds and contigs. 

69 The last one is only included if NCBI's assembly is contig level. 

70 

71 Args: 

72 ncbi: NCBI dataset assembly statistics. 

73 core: Core database assembly statistics. 

74 

75 Returns: 

76 The common statistics with their value and the statistics with different value, including NCBI' 

77 and core database's values as well as their difference (`core_value - ncbi_value`). 

78 

79 """ 

80 # Prepare counts to be comparable to the NCBI stats 

81 ncbi_main = ncbi.get("assembly_stats", {}) 

82 ncbi_info = ncbi.get("assembly_info", {}) 

83 ncbi_organella = ncbi.get("organelle_info", []) 

84 

85 # First count the organella 

86 core_num_organella = 0 

87 for loc, loc_count in core.get("locations", {}).items(): 

88 if loc != "nuclear_chromosome": 

89 core_num_organella += loc_count 

90 

91 # Our core stats count Organella chromosomes, sanity check here 

92 core_chr = core.get("coord_system", {}).get("chromosome", 0) 

93 core_adjusted_chrs = 0 

94 if core_chr: 

95 core_adjusted_chrs = core_chr - core_num_organella 

96 

97 # Number of scaffolds from our core 

98 core_num_scaffolds = core.get("coord_system", {}).get("scaffold", 0) 

99 

100 # NCBI includes the chromosomes in its scaffold count 

101 core_adjusted_scaffolds = core_num_scaffolds + core_adjusted_chrs 

102 

103 # Compile the counts 

104 ncbi_counts = { 

105 "num_organella": len(ncbi_organella), 

106 "num_chromosomes": ncbi_main.get("total_number_of_chromosomes", 0), 

107 "num_scaffolds": ncbi_main.get("number_of_scaffolds", 0), 

108 "num_contigs": ncbi_main.get("number_of_contigs", 0), 

109 } 

110 core_counts = { 

111 "num_organella": core_num_organella, 

112 "num_chromosomes": core_adjusted_chrs, 

113 "num_scaffolds": core_adjusted_scaffolds, 

114 "num_contigs": core.get("coord_system", {}).get("contig", 0), 

115 } 

116 

117 # Only compare contigs if there are any in NCBI 

118 if ncbi_info.get("assembly_level") != "Contig": 

119 del ncbi_counts["num_contigs"] 

120 del core_counts["num_contigs"] 

121 

122 return stats_dict_cmp(ncbi_counts, core_counts) 

123 

124 

125def compare_annotation(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]: 

126 """Extracts the annotation statistics and returns the comparison between both sources. 

127 

128 Annotation statistics compared: 

129 - protein_coding 

130 - pseudogene (all pseudogene biotypes) 

131 - other (number of misc_RNA) 

132 - total 

133 

134 Args: 

135 ncbi: NCBI dataset annotation statistics. 

136 core: Core database annotation statistics. 

137 

138 Returns: 

139 The common statistics with their value and the statistics with different value, including NCBI' 

140 and core database's values as well as their difference (`core_value - ncbi_value`). 

141 

142 """ 

143 ncbi_counts = { 

144 "protein_coding": ncbi.get("protein_coding", 0), 

145 "pseudogene": ncbi.get("pseudogene", 0), 

146 "total_genes": ncbi.get("total", 0), 

147 "other": ncbi.get("other", 0), 

148 } 

149 

150 # Prepare core database counts to be comparable 

151 core_biotypes = core.get("genes", {}).get("biotypes", {}) 

152 

153 # Add all pseudogenes 

154 num_pseudogenes = 0 

155 for name, num in core_biotypes.items(): 

156 if re.match(".*pseudogen.*", name): 

157 num_pseudogenes += num 

158 

159 # Other genes such as misc_mRNA 

160 num_others = core_biotypes.get("misc_RNA", 0) 

161 

162 core_counts = { 

163 "protein_coding": core_biotypes.get("protein_coding", 0), 

164 "pseudogene": num_pseudogenes, 

165 "total_genes": core.get("genes", {}).get("total", 0), 

166 "other": num_others, 

167 } 

168 

169 return stats_dict_cmp(ncbi_counts, core_counts) 

170 

171 

172def compare_stats(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]: 

173 """Compares the genome statistics between an NCBI dataset and a core database. 

174 

175 Args: 

176 ncbi: NCBI dataset genome statistics. 

177 core: Core database genome statistics. 

178 

179 Returns: 

180 The common statistics with their value and the statistics with different value, including NCBI' 

181 and core database's values as well as their difference (`core_value - ncbi_value`), for the 

182 assembly and annotation (if present in one of the sources) under "assembly_diff" and 

183 "annotation_diff" keys, respectively. 

184 

185 """ 

186 ncbi_annotation_stats = ncbi.get("annotation_info", {}).get("stats", {}).get("gene_counts", {}) 

187 core_assembly_stats = core.get("assembly_stats", {}) 

188 core_annotation_stats = core.get("annotation_stats", {}) 

189 

190 comp: Dict[str, Dict] = { 

191 "assembly_diff": compare_assembly(ncbi, core_assembly_stats), 

192 } 

193 if core_annotation_stats or ncbi_annotation_stats: 

194 comp["annotation_diff"] = compare_annotation(ncbi_annotation_stats, core_annotation_stats) 

195 return comp 

196 

197 

198def compare_stats_files(ncbi_file: PathLike, core_file: PathLike) -> Dict[str, Dict]: 

199 """Compares the genome statistics between an NCBI dataset and a core database. 

200 

201 Args: 

202 ncbi_file: NCBI dataset genome statistics JSON file. 

203 core_file: Core database genome statistics JSON file. 

204 

205 Returns: 

206 The common statistics with their value and the statistics with different value, including NCBI' 

207 and core database's values as well as their difference (`core_value - ncbi_value`), for the 

208 assembly and annotation (if present in one of the sources) under "assembly_diff" and 

209 "annotation_diff" keys, respectively. 

210 

211 """ 

212 ncbi_stats = {} 

213 ncbi_stats = get_json(ncbi_file)["reports"][0] 

214 core_stats = get_json(core_file) 

215 all_stats = compare_stats(ncbi_stats, core_stats) 

216 return all_stats 

217 

218 

219def main() -> None: 

220 """Main script entry-point.""" 

221 parser = ArgumentParser( 

222 description="Compares the genome statistics between an NCBI dataset and a core database." 

223 ) 

224 parser.add_argument_src_path("--ncbi_stats", required=True, help="NCBI dataset stats JSON file") 

225 parser.add_argument_src_path("--core_stats", required=True, help="core database stats JSON file") 

226 parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__) 

227 parser.add_log_arguments(add_log_file=True) 

228 args = parser.parse_args() 

229 

230 # Configure and initialise logging 

231 init_logging_with_args(args) 

232 

233 report = compare_stats_files(args.ncbi_stats, args.core_stats) 

234 print(json.dumps(report, indent=2, sort_keys=True))