Coverage for src/python/ensembl/io/genomio/genome

1# See the NOTICE file distributed with this work for additional information

2# regarding copyright ownership.

4# Licensed under the Apache License, Version 2.0 (the "License");

5# you may not use this file except in compliance with the License.

6# You may obtain a copy of the License at

8# http://www.apache.org/licenses/LICENSE-2.0

10# Unless required by applicable law or agreed to in writing, software

11# distributed under the License is distributed on an "AS IS" BASIS,

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13# See the License for the specific language governing permissions and

14# limitations under the License.

15"""Tool set to compare genome statistic between NCBI datasets and Ensembl's core databases."""

17__all__ = ["stats_dict_cmp", "compare_assembly", "compare_annotation", "compare_stats", "compare_stats_files"]

19import json

20from os import PathLike

21import re

22from typing import Any, Dict

24import ensembl.io.genomio

25from ensembl.io.genomio.utils import get_json

26from ensembl.utils.argparse import ArgumentParser

27from ensembl.utils.logging import init_logging_with_args

30def stats_dict_cmp(ncbi: Dict[str, int], core: Dict[str, int]) -> Dict[str, Dict]:

31 """Compares both dictionaries and returns the similar and different elements between both.

33 The method assumes both dictionaries have the same set of keys. A key would be considered the

34 same if its value in both dictionaries is the same, but will only be included in the returned

35 dictionary if that value is different than 0.

37 Args:

38 ncbi: NCBI dataset statistics in key-value pairs.

39 core: Core database statistics in key-value pairs.

41 Returns:

42 A dictionary with 2 keys:

43 - "same": Pairs of key - value for those entries equal in both dictionaries.

44 - "different": Keys that differ, with values for "ncbi", "core", and "diff", i.e. their

45 difference represented as `core_value - ncbi_value`.

47 """

48 diff = {}

49 same = {}

50 for key, ncbi_count in ncbi.items():

51 core_count = core[key]

52 if ncbi_count == core_count:

53 if ncbi_count != 0:

54 same[key] = ncbi_count

55 else:

56 diff[key] = {"ncbi": ncbi_count, "core": core_count, "diff": core_count - ncbi_count}

57 comparison: Dict[str, Dict] = {}

58 if same:

59 comparison["same"] = same

60 if diff:

61 comparison["different"] = diff

62 return comparison

65def compare_assembly(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]:

66 """Extracts the assembly statistics and returns the comparison between both sources.

68 The assembly statistics compared are the number of: organella, chromosomes, scaffolds and contigs.

69 The last one is only included if NCBI's assembly is contig level.

71 Args:

72 ncbi: NCBI dataset assembly statistics.

73 core: Core database assembly statistics.

75 Returns:

76 The common statistics with their value and the statistics with different value, including NCBI'

77 and core database's values as well as their difference (`core_value - ncbi_value`).

79 """

80 # Prepare counts to be comparable to the NCBI stats

81 ncbi_main = ncbi.get("assembly_stats", {})

82 ncbi_info = ncbi.get("assembly_info", {})

83 ncbi_organella = ncbi.get("organelle_info", [])

85 # First count the organella

86 core_num_organella = 0

87 for loc, loc_count in core.get("locations", {}).items():

88 if loc != "nuclear_chromosome":

89 core_num_organella += loc_count

91 # Our core stats count Organella chromosomes, sanity check here

92 core_chr = core.get("coord_system", {}).get("chromosome", 0)

93 core_adjusted_chrs = 0

94 if core_chr:

95 core_adjusted_chrs = core_chr - core_num_organella

97 # Number of scaffolds from our core

98 core_num_scaffolds = core.get("coord_system", {}).get("scaffold", 0)

100 # NCBI includes the chromosomes in its scaffold count

101 core_adjusted_scaffolds = core_num_scaffolds + core_adjusted_chrs

102

103 # Compile the counts

104 ncbi_counts = {

105 "num_organella": len(ncbi_organella),

106 "num_chromosomes": ncbi_main.get("total_number_of_chromosomes", 0),

107 "num_scaffolds": ncbi_main.get("number_of_scaffolds", 0),

108 "num_contigs": ncbi_main.get("number_of_contigs", 0),

109 }

110 core_counts = {

111 "num_organella": core_num_organella,

112 "num_chromosomes": core_adjusted_chrs,

113 "num_scaffolds": core_adjusted_scaffolds,

114 "num_contigs": core.get("coord_system", {}).get("contig", 0),

115 }

116

117 # Only compare contigs if there are any in NCBI

118 if ncbi_info.get("assembly_level") != "Contig":

119 del ncbi_counts["num_contigs"]

120 del core_counts["num_contigs"]

121

122 return stats_dict_cmp(ncbi_counts, core_counts)

123

124

125def compare_annotation(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]:

126 """Extracts the annotation statistics and returns the comparison between both sources.

127

128 Annotation statistics compared:

129 - protein_coding

130 - pseudogene (all pseudogene biotypes)

131 - other (number of misc_RNA)

132 - total

133

134 Args:

135 ncbi: NCBI dataset annotation statistics.

136 core: Core database annotation statistics.

137

138 Returns:

139 The common statistics with their value and the statistics with different value, including NCBI'

140 and core database's values as well as their difference (`core_value - ncbi_value`).

141

142 """

143 ncbi_counts = {

144 "protein_coding": ncbi.get("protein_coding", 0),

145 "pseudogene": ncbi.get("pseudogene", 0),

146 "total_genes": ncbi.get("total", 0),

147 "other": ncbi.get("other", 0),

148 }

149

150 # Prepare core database counts to be comparable

151 core_biotypes = core.get("genes", {}).get("biotypes", {})

152

153 # Add all pseudogenes

154 num_pseudogenes = 0

155 for name, num in core_biotypes.items():

156 if re.match(".*pseudogen.*", name):

157 num_pseudogenes += num

158

159 # Other genes such as misc_mRNA

160 num_others = core_biotypes.get("misc_RNA", 0)

161

162 core_counts = {

163 "protein_coding": core_biotypes.get("protein_coding", 0),

164 "pseudogene": num_pseudogenes,

165 "total_genes": core.get("genes", {}).get("total", 0),

166 "other": num_others,

167 }

168

169 return stats_dict_cmp(ncbi_counts, core_counts)

170

171

172def compare_stats(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]:

173 """Compares the genome statistics between an NCBI dataset and a core database.

174

175 Args:

176 ncbi: NCBI dataset genome statistics.

177 core: Core database genome statistics.

178

179 Returns:

180 The common statistics with their value and the statistics with different value, including NCBI'

181 and core database's values as well as their difference (`core_value - ncbi_value`), for the

182 assembly and annotation (if present in one of the sources) under "assembly_diff" and

183 "annotation_diff" keys, respectively.

184

185 """

186 ncbi_annotation_stats = ncbi.get("annotation_info", {}).get("stats", {}).get("gene_counts", {})

187 core_assembly_stats = core.get("assembly_stats", {})

188 core_annotation_stats = core.get("annotation_stats", {})

189

190 comp: Dict[str, Dict] = {

191 "assembly_diff": compare_assembly(ncbi, core_assembly_stats),

192 }

193 if core_annotation_stats or ncbi_annotation_stats:

194 comp["annotation_diff"] = compare_annotation(ncbi_annotation_stats, core_annotation_stats)

195 return comp

196

197

198def compare_stats_files(ncbi_file: PathLike, core_file: PathLike) -> Dict[str, Dict]:

199 """Compares the genome statistics between an NCBI dataset and a core database.

200

201 Args:

202 ncbi_file: NCBI dataset genome statistics JSON file.

203 core_file: Core database genome statistics JSON file.

204

205 Returns:

206 The common statistics with their value and the statistics with different value, including NCBI'

207 and core database's values as well as their difference (`core_value - ncbi_value`), for the

208 assembly and annotation (if present in one of the sources) under "assembly_diff" and

209 "annotation_diff" keys, respectively.

210

211 """

212 ncbi_stats = {}

213 ncbi_stats = get_json(ncbi_file)["reports"][0]

214 core_stats = get_json(core_file)

215 all_stats = compare_stats(ncbi_stats, core_stats)

216 return all_stats

217

218

219def main() -> None:

220 """Main script entry-point."""

221 parser = ArgumentParser(

222 description="Compares the genome statistics between an NCBI dataset and a core database."

223 )

224 parser.add_argument_src_path("--ncbi_stats", required=True, help="NCBI dataset stats JSON file")

225 parser.add_argument_src_path("--core_stats", required=True, help="core database stats JSON file")

226 parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)

227 parser.add_log_arguments(add_log_file=True)

228 args = parser.parse_args()

229

230 # Configure and initialise logging

231 init_logging_with_args(args)

232

233 report = compare_stats_files(args.ncbi_stats, args.core_stats)

234 print(json.dumps(report, indent=2, sort_keys=True))

Coverage for src/python/ensembl/io/genomio/genome_stats/compare.py: 91%

78 statements