Coverage for src/python/ensembl/io/genomio/genome_stats/compare.py: 91%
78 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Tool set to compare genome statistic between NCBI datasets and Ensembl's core databases."""
17__all__ = ["stats_dict_cmp", "compare_assembly", "compare_annotation", "compare_stats", "compare_stats_files"]
19import json
20from os import PathLike
21import re
22from typing import Any, Dict
24import ensembl.io.genomio
25from ensembl.io.genomio.utils import get_json
26from ensembl.utils.argparse import ArgumentParser
27from ensembl.utils.logging import init_logging_with_args
30def stats_dict_cmp(ncbi: Dict[str, int], core: Dict[str, int]) -> Dict[str, Dict]:
31 """Compares both dictionaries and returns the similar and different elements between both.
33 The method assumes both dictionaries have the same set of keys. A key would be considered the
34 same if its value in both dictionaries is the same, but will only be included in the returned
35 dictionary if that value is different than 0.
37 Args:
38 ncbi: NCBI dataset statistics in key-value pairs.
39 core: Core database statistics in key-value pairs.
41 Returns:
42 A dictionary with 2 keys:
43 - "same": Pairs of key - value for those entries equal in both dictionaries.
44 - "different": Keys that differ, with values for "ncbi", "core", and "diff", i.e. their
45 difference represented as `core_value - ncbi_value`.
47 """
48 diff = {}
49 same = {}
50 for key, ncbi_count in ncbi.items():
51 core_count = core[key]
52 if ncbi_count == core_count:
53 if ncbi_count != 0:
54 same[key] = ncbi_count
55 else:
56 diff[key] = {"ncbi": ncbi_count, "core": core_count, "diff": core_count - ncbi_count}
57 comparison: Dict[str, Dict] = {}
58 if same:
59 comparison["same"] = same
60 if diff:
61 comparison["different"] = diff
62 return comparison
65def compare_assembly(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]:
66 """Extracts the assembly statistics and returns the comparison between both sources.
68 The assembly statistics compared are the number of: organella, chromosomes, scaffolds and contigs.
69 The last one is only included if NCBI's assembly is contig level.
71 Args:
72 ncbi: NCBI dataset assembly statistics.
73 core: Core database assembly statistics.
75 Returns:
76 The common statistics with their value and the statistics with different value, including NCBI'
77 and core database's values as well as their difference (`core_value - ncbi_value`).
79 """
80 # Prepare counts to be comparable to the NCBI stats
81 ncbi_main = ncbi.get("assembly_stats", {})
82 ncbi_info = ncbi.get("assembly_info", {})
83 ncbi_organella = ncbi.get("organelle_info", [])
85 # First count the organella
86 core_num_organella = 0
87 for loc, loc_count in core.get("locations", {}).items():
88 if loc != "nuclear_chromosome":
89 core_num_organella += loc_count
91 # Our core stats count Organella chromosomes, sanity check here
92 core_chr = core.get("coord_system", {}).get("chromosome", 0)
93 core_adjusted_chrs = 0
94 if core_chr:
95 core_adjusted_chrs = core_chr - core_num_organella
97 # Number of scaffolds from our core
98 core_num_scaffolds = core.get("coord_system", {}).get("scaffold", 0)
100 # NCBI includes the chromosomes in its scaffold count
101 core_adjusted_scaffolds = core_num_scaffolds + core_adjusted_chrs
103 # Compile the counts
104 ncbi_counts = {
105 "num_organella": len(ncbi_organella),
106 "num_chromosomes": ncbi_main.get("total_number_of_chromosomes", 0),
107 "num_scaffolds": ncbi_main.get("number_of_scaffolds", 0),
108 "num_contigs": ncbi_main.get("number_of_contigs", 0),
109 }
110 core_counts = {
111 "num_organella": core_num_organella,
112 "num_chromosomes": core_adjusted_chrs,
113 "num_scaffolds": core_adjusted_scaffolds,
114 "num_contigs": core.get("coord_system", {}).get("contig", 0),
115 }
117 # Only compare contigs if there are any in NCBI
118 if ncbi_info.get("assembly_level") != "Contig":
119 del ncbi_counts["num_contigs"]
120 del core_counts["num_contigs"]
122 return stats_dict_cmp(ncbi_counts, core_counts)
125def compare_annotation(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]:
126 """Extracts the annotation statistics and returns the comparison between both sources.
128 Annotation statistics compared:
129 - protein_coding
130 - pseudogene (all pseudogene biotypes)
131 - other (number of misc_RNA)
132 - total
134 Args:
135 ncbi: NCBI dataset annotation statistics.
136 core: Core database annotation statistics.
138 Returns:
139 The common statistics with their value and the statistics with different value, including NCBI'
140 and core database's values as well as their difference (`core_value - ncbi_value`).
142 """
143 ncbi_counts = {
144 "protein_coding": ncbi.get("protein_coding", 0),
145 "pseudogene": ncbi.get("pseudogene", 0),
146 "total_genes": ncbi.get("total", 0),
147 "other": ncbi.get("other", 0),
148 }
150 # Prepare core database counts to be comparable
151 core_biotypes = core.get("genes", {}).get("biotypes", {})
153 # Add all pseudogenes
154 num_pseudogenes = 0
155 for name, num in core_biotypes.items():
156 if re.match(".*pseudogen.*", name):
157 num_pseudogenes += num
159 # Other genes such as misc_mRNA
160 num_others = core_biotypes.get("misc_RNA", 0)
162 core_counts = {
163 "protein_coding": core_biotypes.get("protein_coding", 0),
164 "pseudogene": num_pseudogenes,
165 "total_genes": core.get("genes", {}).get("total", 0),
166 "other": num_others,
167 }
169 return stats_dict_cmp(ncbi_counts, core_counts)
172def compare_stats(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]:
173 """Compares the genome statistics between an NCBI dataset and a core database.
175 Args:
176 ncbi: NCBI dataset genome statistics.
177 core: Core database genome statistics.
179 Returns:
180 The common statistics with their value and the statistics with different value, including NCBI'
181 and core database's values as well as their difference (`core_value - ncbi_value`), for the
182 assembly and annotation (if present in one of the sources) under "assembly_diff" and
183 "annotation_diff" keys, respectively.
185 """
186 ncbi_annotation_stats = ncbi.get("annotation_info", {}).get("stats", {}).get("gene_counts", {})
187 core_assembly_stats = core.get("assembly_stats", {})
188 core_annotation_stats = core.get("annotation_stats", {})
190 comp: Dict[str, Dict] = {
191 "assembly_diff": compare_assembly(ncbi, core_assembly_stats),
192 }
193 if core_annotation_stats or ncbi_annotation_stats:
194 comp["annotation_diff"] = compare_annotation(ncbi_annotation_stats, core_annotation_stats)
195 return comp
198def compare_stats_files(ncbi_file: PathLike, core_file: PathLike) -> Dict[str, Dict]:
199 """Compares the genome statistics between an NCBI dataset and a core database.
201 Args:
202 ncbi_file: NCBI dataset genome statistics JSON file.
203 core_file: Core database genome statistics JSON file.
205 Returns:
206 The common statistics with their value and the statistics with different value, including NCBI'
207 and core database's values as well as their difference (`core_value - ncbi_value`), for the
208 assembly and annotation (if present in one of the sources) under "assembly_diff" and
209 "annotation_diff" keys, respectively.
211 """
212 ncbi_stats = {}
213 ncbi_stats = get_json(ncbi_file)["reports"][0]
214 core_stats = get_json(core_file)
215 all_stats = compare_stats(ncbi_stats, core_stats)
216 return all_stats
219def main() -> None:
220 """Main script entry-point."""
221 parser = ArgumentParser(
222 description="Compares the genome statistics between an NCBI dataset and a core database."
223 )
224 parser.add_argument_src_path("--ncbi_stats", required=True, help="NCBI dataset stats JSON file")
225 parser.add_argument_src_path("--core_stats", required=True, help="core database stats JSON file")
226 parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
227 parser.add_log_arguments(add_log_file=True)
228 args = parser.parse_args()
230 # Configure and initialise logging
231 init_logging_with_args(args)
233 report = compare_stats_files(args.ncbi_stats, args.core_stats)
234 print(json.dumps(report, indent=2, sort_keys=True))