Coverage for src/python/ensembl/io/genomio/annotation/update_description.py: 92%

107 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15"""Update descriptions from a functional annotation file into a core database.""" 

16 

# Names exported by `from <module> import *`; the underscore-prefixed helpers are left out.
__all__ = [
    "get_core_data",
    "load_descriptions",
]

21 

22import logging 

23from pathlib import Path 

24from typing import Any, Dict, List, Optional, Tuple 

25 

26from sqlalchemy.orm import Session 

27from sqlalchemy import and_, select 

28 

29import ensembl.io.genomio 

30from ensembl.core.models import Gene, Transcript, ObjectXref, Xref 

31from ensembl.io.genomio.utils import get_json 

32from ensembl.utils.argparse import ArgumentParser 

33from ensembl.utils.database import DBConnection 

34from ensembl.utils.logging import init_logging_with_args 

35 

36 

# Presumably maps an annotation object type to the core table that stores it.
# NOTE(review): not referenced by any function visible in this module - confirm it is still needed.
FEAT_TABLE = {
    "gene": "gene",
    "mobile_element": "gene",
    "transcript": "transcript",
}

# A core feature as built by `get_core_data`: (table row ID, stable ID, description).
FeatStruct = Tuple[str, str, str]

44 

45 

def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> Dict[str, FeatStruct]:
    """Returns the table descriptions from a core database.

    Every feature is indexed by its lower-cased stable ID. With `match_xrefs`, it is also indexed by
    the lower-cased primary accession of each xref attached to it. A stable ID key always takes
    precedence over an xref accession with the same text, whatever the order of the rows.

    Args:
        session: Session open on a core database.
        table: "gene" or "transcript" table from the core database.
        match_xrefs: If the IDs do not match, try to match an Xref ID instead.

    Returns:
        A dict mapping each lower-cased key to a `(row ID, stable ID, description)` tuple.

    Raises:
        ValueError: If `table` is neither "gene" nor "transcript".
    """
    if table == "gene":
        model, id_column = Gene, Gene.gene_id
    elif table == "transcript":
        model, id_column = Transcript, Transcript.transcript_id
    else:
        raise ValueError(f"Table {table} is not supported")

    # Outer joins keep the features that have no xref at all (their accession is then None)
    stmt = (
        select(id_column, model.stable_id, model.description, Xref.dbprimary_acc)
        .select_from(model)
        .outerjoin(
            ObjectXref,
            and_(id_column == ObjectXref.ensembl_id, ObjectXref.ensembl_object_type == table),
        )
        .outerjoin(Xref)
    )

    by_stable_id: Dict[str, FeatStruct] = {}
    by_xref: Dict[str, FeatStruct] = {}
    for feat_id, stable_id, desc, xref_name in session.execute(stmt):
        feat_struct: FeatStruct = (feat_id, stable_id, desc)
        # A feature without stable ID can only be reached through its xrefs
        if stable_id is not None:
            by_stable_id[stable_id.lower()] = feat_struct
        if match_xrefs and xref_name:
            by_xref[xref_name.lower()] = feat_struct

    # Merge last so that an xref accession can never shadow a real stable ID
    return {**by_xref, **by_stable_id}

90 

91 

def load_descriptions(
    session: Session,
    func_file: Path,
    report: bool = False,
    do_update: bool = False,
    match_xrefs: bool = True,
) -> None:
    """Loads gene and transcript descriptions into a core database.

    The gene annotations are processed first, then the transcript ones. For each table the
    annotations are compared with the core features, the non-zero counters are logged, and the
    changed descriptions are written and committed only if `do_update` is set.

    Args:
        session: Session open on a core database.
        func_file: JSON file with the annotation information.
        report: Print the mapping of changes to perform in the standard output.
        do_update: Actually update the core database.
        match_xrefs: If the IDs do not match, try to match an Xref ID instead.
    """
    stat_names = (
        "not_supported",
        "not_found",
        "same",
        "same_empty",
        "empty_but_xref",
        "to_update_replace",
        "to_update_remove",
    )
    func = get_json(func_file)
    logging.info(f"{len(func)} annotations from {func_file}")

    for table, mapped_table in (("gene", Gene), ("transcript", Transcript)):
        logging.info(f"Checking {table} descriptions")
        feat_func = [feat for feat in func if feat["object_type"] == table]
        logging.info(f"{len(feat_func)} {table} annotations from {func_file}")
        feat_data = get_core_data(session, table, match_xrefs)
        logging.info(f"Loaded {len(feat_data)} {table} data")

        # Fresh counters for every table; the comparison below fills them in
        stats = dict.fromkeys(stat_names, 0)
        features_to_update = _get_features_to_update(
            table, feat_func, feat_data, stats, report=report, do_update=do_update, match_xrefs=match_xrefs
        )

        # Only the counters that were actually hit are worth logging
        for stat, count in stats.items():
            if count != 0:
                logging.info(f"{stat} = {count}")

        if not do_update:
            continue
        logging.info(f"Now updating {len(features_to_update)} rows...")
        session.bulk_update_mappings(mapped_table, features_to_update)
        session.commit()

142 

143 

144def _get_cur_feat( 

145 feat_data: Dict[str, FeatStruct], new_feat: Dict[str, Any], match_xrefs: bool = False 

146) -> Optional[FeatStruct]: 

147 """Match a feature ID, synonyms or xrefs to a core stable ID and return the matching core feature. 

148 

149 Returns None if no match. 

150 """ 

151 # Match with the ID 

152 cur_feat = feat_data.get(new_feat["id"].lower()) 

153 

154 # Fall back to a synonym 

155 if not cur_feat and "synonyms" in new_feat: 

156 for syn in new_feat["synonyms"]: 

157 cur_feat = feat_data.get(syn.lower()) 

158 if cur_feat: 

159 break 

160 

161 # Fall back to an xref 

162 if not cur_feat and match_xrefs and "xrefs" in new_feat: 

163 for xref in new_feat["xrefs"]: 

164 cur_feat = feat_data.get(xref["id"].lower()) 

165 if cur_feat: 

166 break 

167 

168 return cur_feat 

169 

170 

def _get_features_to_update(
    table: str,
    feat_func: List[Dict[str, Any]],
    feat_data: Dict[str, FeatStruct],
    stats: Dict[str, int],
    *,
    report: bool = False,
    do_update: bool = False,
    match_xrefs: bool = True,
) -> List[Dict[str, Any]]:
    """Checks a list of features and returns those whose description we want to update.

    A missing or null description in either the annotation or the database is treated as an
    empty string. A description is left untouched when it is unchanged, or when the new one is empty
    and the current one contains "[Source:".

    Args:
        table: "gene" or "transcript" table for the features.
        feat_func: The features to check.
        feat_data: The features in the database.
        stats: Record the number of features checked in different cases (updated in place).
        report: Print a report line for each feature to standard output.
        do_update: Actually update the database.
        match_xrefs: Use xref IDs if feature ID does not match a feature in the database.

    Returns:
        One `{"<table>_id": row ID, "description": new description}` mapping per description to
        change. The list is always empty if `do_update` is false.
    """
    update_key = f"{table}_id"
    to_update = []
    for new_feat in feat_func:
        cur_feat = _get_cur_feat(feat_data, new_feat, match_xrefs)

        # No match in the end
        if not cur_feat:
            logging.debug(f"Not found: {table} '{new_feat['id']}'")
            stats["not_found"] += 1
            continue

        # Prepare some data to compare; an explicit null description counts as empty
        new_stable_id = new_feat["id"]
        new_desc = new_feat.get("description") or ""
        (row_id, cur_stable_id, cur_desc) = cur_feat
        cur_desc = cur_desc or ""

        # No description: replace unless the current description is from an Xref
        if not new_desc:
            if not cur_desc:
                stats["same_empty"] += 1
                continue
            if "[Source:" in cur_desc:
                stats["empty_but_xref"] += 1
                continue
            stats["to_update_remove"] += 1
        # Compare the descriptions
        elif new_desc == cur_desc:
            stats["same"] += 1
            continue
        # At this point, we have a new description to update
        else:
            stats["to_update_replace"] += 1

        # Directly print the mapping
        if report:
            line = (table, new_stable_id, cur_stable_id or "", cur_desc, new_desc)
            print("\t".join(line))

        # Add to the batch list of updates for the core db
        if do_update:
            to_update.append({update_key: row_id, "description": new_desc})

    return to_update

241 

242 

def main() -> None:
    """Main script entry-point.

    Parses the command line, sets up logging, then runs `load_descriptions` in a session opened on
    the database given by the server arguments.
    """
    arg_parser = ArgumentParser(description=__doc__)
    arg_parser.add_server_arguments(include_database=True)
    arg_parser.add_argument_src_path("--func_file", required=True, help="Input functional annotation JSON")
    arg_parser.add_argument("--report", action="store_true", help="Show what change would be made")
    arg_parser.add_argument("--update", action="store_true", help="Make the changes to the database")
    arg_parser.add_argument(
        "--match_xrefs", action="store_true", help="Use xref IDs to match features if IDs do not work"
    )
    arg_parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    arg_parser.add_log_arguments(add_log_file=True)
    options = arg_parser.parse_args()
    init_logging_with_args(options)

    with DBConnection(options.url).session_scope() as session:
        load_descriptions(
            session,
            options.func_file,
            report=options.report,
            do_update=options.update,
            match_xrefs=options.match_xrefs,
        )