Coverage for src/python/ensembl/io/genomio/annotation/update_description.py: 92%
107 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Update descriptions from a functional annotation file into a core database."""
# Names exported by `from ... import *`.
__all__ = ["get_core_data", "load_descriptions"]
22import logging
23from pathlib import Path
24from typing import Any, Dict, List, Optional, Tuple
26from sqlalchemy.orm import Session
27from sqlalchemy import and_, select
29import ensembl.io.genomio
30from ensembl.core.models import Gene, Transcript, ObjectXref, Xref
31from ensembl.io.genomio.utils import get_json
32from ensembl.utils.argparse import ArgumentParser
33from ensembl.utils.database import DBConnection
34from ensembl.utils.logging import init_logging_with_args
# Maps a feature type name to a table name; "mobile_element" shares the "gene" entry.
# NOTE(review): not referenced by the functions visible in this module - confirm it is still needed.
FEAT_TABLE = {"gene": "gene", "mobile_element": "gene", "transcript": "transcript"}

# Record built by get_core_data() for each core feature: (row ID, stable ID, description).
# NOTE(review): the row ID comes from the gene_id/transcript_id column, presumably an integer
# despite the `str` annotation - confirm.
FeatStruct = Tuple[str, str, str]
def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> Dict[str, FeatStruct]:
    """Returns the table descriptions from a core database.

    Every feature is indexed by its lower-cased stable ID. When `match_xrefs` is set, the feature
    is also indexed by the lower-cased primary accession of each of its xrefs, but an xref key never
    replaces a key that is a real stable ID: xrefs are only a fallback for IDs that do not match.

    Args:
        session: Session open on a core database.
        table: "gene" or "transcript" table from the core database.
        match_xrefs: If the IDs do not match, try to match an Xref ID instead.

    Returns:
        A dict mapping each lower-cased key to a `(row ID, stable ID, description)` tuple.

    Raises:
        ValueError: If `table` is neither "gene" nor "transcript".
    """
    if table == "gene":
        model, id_column = Gene, Gene.gene_id
    elif table == "transcript":
        model, id_column = Transcript, Transcript.transcript_id
    else:
        raise ValueError(f"Table {table} is not supported")

    # Outer joins keep the features that have no xref at all (their xref column is then NULL)
    stmt = (
        select(id_column, model.stable_id, model.description, Xref.dbprimary_acc)
        .select_from(model)
        .outerjoin(
            ObjectXref,
            and_(id_column == ObjectXref.ensembl_id, ObjectXref.ensembl_object_type == table),
        )
        .outerjoin(Xref)
    )

    # Keep the two kinds of keys apart while reading: a feature comes back once per xref, and an
    # xref accession may be identical to the stable ID of a different feature.
    feat_data: Dict[str, FeatStruct] = {}
    xref_data: Dict[str, FeatStruct] = {}
    for feat_id, stable_id, desc, xref_name in session.execute(stmt):
        feat_struct: FeatStruct = (feat_id, stable_id, desc)
        feat_data[stable_id.lower()] = feat_struct
        if match_xrefs and xref_name:
            xref_data[xref_name.lower()] = feat_struct

    # Stable IDs win over xref aliases, whatever the order of the rows was
    for xref_key, feat_struct in xref_data.items():
        feat_data.setdefault(xref_key, feat_struct)

    return feat_data
def load_descriptions(
    session: Session,
    func_file: Path,
    report: bool = False,
    do_update: bool = False,
    match_xrefs: bool = True,
) -> None:
    """Compares the descriptions of an annotation file with a core database, table by table.

    The "gene" annotations are processed first, then the "transcript" ones. For each table the
    non-zero counters are logged, and the changed descriptions are written and committed only if
    `do_update` is set.

    Args:
        session: Session open on a core database.
        func_file: JSON file with the annotation information.
        report: Print the mapping of changes to perform in the standard output.
        do_update: Actually update the core database.
        match_xrefs: If the IDs do not match, try to match an Xref ID instead.
    """
    annotations = get_json(func_file)
    logging.info(f"{len(annotations)} annotations from {func_file}")

    # Counters handed to _get_features_to_update(), in the order they are logged
    stat_names = (
        "not_supported",
        "not_found",
        "same",
        "same_empty",
        "empty_but_xref",
        "to_update_replace",
        "to_update_remove",
    )

    for table, orm_class in (("gene", Gene), ("transcript", Transcript)):
        logging.info(f"Checking {table} descriptions")
        table_annotations = [annot for annot in annotations if annot["object_type"] == table]
        logging.info(f"{len(table_annotations)} {table} annotations from {func_file}")
        core_features = get_core_data(session, table, match_xrefs)
        logging.info(f"Loaded {len(core_features)} {table} data")

        # Fresh counters for every table
        stats = dict.fromkeys(stat_names, 0)

        # Only the features whose description differs are returned
        changed_rows = _get_features_to_update(
            table,
            table_annotations,
            core_features,
            stats,
            report=report,
            do_update=do_update,
            match_xrefs=match_xrefs,
        )

        # Log the counters that were actually hit for this table
        for stat, count in stats.items():
            if count != 0:
                logging.info(f"{stat} = {count}")

        if do_update:
            logging.info(f"Now updating {len(changed_rows)} rows...")
            session.bulk_update_mappings(orm_class, changed_rows)
            session.commit()
def _get_cur_feat(
    feat_data: Dict[str, FeatStruct], new_feat: Dict[str, Any], match_xrefs: bool = False
) -> Optional[FeatStruct]:
    """Finds the core feature that corresponds to an annotated feature.

    The candidate names are tried in order, lower-cased: the feature ID, then each synonym, then
    (only if `match_xrefs` is set) the ID of each xref. The first name found in `feat_data` wins.

    Returns None if no match.
    """

    def _candidate_names():
        # Lazy on purpose: synonyms and xrefs are only read once the previous names have failed
        yield new_feat["id"]
        if "synonyms" in new_feat:
            yield from new_feat["synonyms"]
        if match_xrefs and "xrefs" in new_feat:
            for xref in new_feat["xrefs"]:
                yield xref["id"]

    match = None
    for name in _candidate_names():
        match = feat_data.get(name.lower())
        if match:
            break
    return match
def _get_features_to_update(
    table: str,
    feat_func: List[Dict[str, Any]],
    feat_data: Dict[str, FeatStruct],
    stats: Dict[str, int],
    *,
    report: bool = False,
    do_update: bool = False,
    match_xrefs: bool = True,
) -> List[Dict[str, Any]]:
    """Compares annotated features with the database and collects the descriptions to change.

    Each feature increments exactly one counter of `stats`: "not_found", "same", "same_empty",
    "empty_but_xref", "to_update_replace" or "to_update_remove". Only the last two lead to a change.

    Args:
        table: "gene" or "transcript" table for the features.
        feat_func: The features to check.
        feat_data: The features in the database.
        stats: Record the number of features checked in different cases (modified in place).
        report: Print a tab-separated report line for each changed feature to standard output.
        do_update: Collect the changes so that they can be applied to the database.
        match_xrefs: Use xref IDs if feature ID does not match a feature in the database.

    Returns:
        One mapping per changed feature, of the form
        `{"<table>_id": row ID, "description": new description}`.
        The list is always empty when `do_update` is not set.
    """
    id_column = f"{table}_id"
    pending: List[Dict[str, Any]] = []

    for annotation in feat_func:
        matched = _get_cur_feat(feat_data, annotation, match_xrefs)
        if not matched:
            logging.debug(f"Not found: {table} '{annotation['id']}'")
            stats["not_found"] += 1
            continue

        annotated_id = annotation["id"]
        annotated_desc = annotation.get("description", "")
        row_id, core_stable_id, core_desc = matched
        # A missing description in the database is compared as an empty string
        core_desc = core_desc or ""

        if annotated_desc:
            if annotated_desc == core_desc:
                stats["same"] += 1
                continue
            # A different, non-empty description replaces the current one
            stats["to_update_replace"] += 1
        else:
            if core_desc == "":
                stats["same_empty"] += 1
                continue
            # Keep a description that was derived from an Xref rather than blanking it
            if "[Source:" in core_desc:
                stats["empty_but_xref"] += 1
                continue
            stats["to_update_remove"] += 1

        if report:
            print("\t".join((table, annotated_id, core_stable_id, core_desc, annotated_desc)))

        if do_update:
            pending.append({id_column: row_id, "description": annotated_desc})

    return pending
def main() -> None:
    """Command-line entry point: parses the arguments and runs the description update."""
    cli = ArgumentParser(description=__doc__)
    cli.add_server_arguments(include_database=True)
    cli.add_argument_src_path("--func_file", required=True, help="Input functional annotation JSON")
    cli.add_argument("--report", action="store_true", help="Show what change would be made")
    cli.add_argument("--update", action="store_true", help="Make the changes to the database")
    cli.add_argument(
        "--match_xrefs", action="store_true", help="Use xref IDs to match features if IDs do not work"
    )
    cli.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    cli.add_log_arguments(add_log_file=True)
    options = cli.parse_args()
    init_logging_with_args(options)

    # All three flags are off unless given on the command line
    with DBConnection(options.url).session_scope() as session:
        load_descriptions(
            session,
            options.func_file,
            report=options.report,
            do_update=options.update,
            match_xrefs=options.match_xrefs,
        )