Coverage for src/python/ensembl/io/genomio/gff3/restructure.py: 99%

135 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Restructure a gene model to a standard representation: `gene -> [ mRNAs -> [CDSs, exons] ]`""" 

16 

17__all__ = [ 

18 "restructure_gene", 

19 "add_transcript_to_naked_gene", 

20 "move_only_cdss_to_new_mrna", 

21 "move_only_exons_to_new_mrna", 

22 "move_cds_to_existing_mrna", 

23 "remove_extra_exons", 

24 "remove_cds_from_pseudogene", 

25] 

26 

27from collections import Counter 

28import logging 

29from typing import List 

30 

31from .exceptions import GFFParserError 

32from .features import GFFSeqFeature 

33 

34 

35def _get_feat_counts(gene: GFFSeqFeature) -> Counter: 

36 return Counter([feat.type for feat in gene.sub_features]) 

37 

38 

39def restructure_gene(gene: GFFSeqFeature) -> None: 

40 """Standardize the structure of a gene model: 

41 - Add a transcript if there are no children 

42 - Move the CDS and exons to an mRNA if they are directly under the gene 

43 

44 Args: 

45 gene: Gene feature to restructure. 

46 

47 Raises: 

48 GFFParserError: If there are CDSs/exons remaining under the gene after applying the fixes. 

49 """ 

50 # Skip if the children of the gene look ok 

51 counts = _get_feat_counts(gene) 

52 if (len(counts) > 0) and not counts.get("CDS") and not counts.get("exon"): 

53 return 

54 

55 # Make sure the gene has a transcript if nothing else 

56 add_transcript_to_naked_gene(gene) 

57 

58 # Corrections if there are CDSs or exons directly under the gene level 

59 move_only_cdss_to_new_mrna(gene) 

60 move_only_exons_to_new_mrna(gene) 

61 move_cds_to_existing_mrna(gene) 

62 remove_extra_exons(gene) 

63 

64 # Check again after fixes that no CDS or exon remain under the gene 

65 counts = _get_feat_counts(gene) 

66 if counts.get("CDS") or counts.get("exon"): 66 ↛ 67line 66 didn't jump to line 67 because the condition on line 66 was never true

67 raise GFFParserError(f"Gene {gene.id} contains direct CDSs and exons children") 

68 

69 

70def add_transcript_to_naked_gene(gene: GFFSeqFeature) -> None: 

71 """Add an unspecific transcript to a gene without any sub features.""" 

72 

73 if (len(gene.sub_features) > 0) or (gene.type != "gene"): 

74 return 

75 

76 transcript = GFFSeqFeature(gene.location, type="transcript") 

77 transcript.qualifiers["source"] = gene.qualifiers["source"] 

78 gene.sub_features = [transcript] 

79 logging.debug(f"Inserted 1 transcript for a lone gene {gene.id}") 

80 

81 

82def move_only_cdss_to_new_mrna(gene: GFFSeqFeature) -> None: 

83 """Add intermediate mRNAs to a gene with only CDS children. 

84 Do nothing if some sub-features are not CDS. 

85 """ 

86 

87 counts = _get_feat_counts(gene) 

88 if (len(counts) != 1) or not counts.get("CDS"): 

89 return 

90 

91 transcripts_dict = {} 

92 

93 for cds in gene.sub_features: 

94 # We create as many transcripts as there are different CDS IDs 

95 if cds.id not in transcripts_dict: 

96 logging.debug(f"Create a new mRNA for {cds.id}") 

97 transcript = GFFSeqFeature(gene.location, type="mRNA") 

98 transcript.qualifiers["source"] = gene.qualifiers["source"] 

99 transcripts_dict[cds.id] = transcript 

100 

101 # Add the CDS to the transcript 

102 transcripts_dict[cds.id].sub_features.append(cds) 

103 

104 # Also add an exon in the same location 

105 exon = GFFSeqFeature(cds.location, type="exon") 

106 exon.qualifiers["source"] = gene.qualifiers["source"] 

107 transcripts_dict[cds.id].sub_features.append(exon) 

108 

109 transcripts = list(transcripts_dict.values()) 

110 gene.sub_features = transcripts 

111 

112 logging.debug(f"Insert transcript-exon feats for {gene.id} ({len(transcripts)} CDSs)") 

113 

114 

115def move_only_exons_to_new_mrna(gene: GFFSeqFeature) -> None: 

116 """Add an mRNA for a gene that only has exons and move the exons under the mRNA. 

117 No change if the gene has other sub_features than exon. 

118 """ 

119 

120 counts = _get_feat_counts(gene) 

121 if (len(counts) != 1) or not counts.get("exon"): 

122 return 

123 

124 transcript = GFFSeqFeature(gene.location, type="mRNA") 

125 transcript.qualifiers["source"] = gene.qualifiers["source"] 

126 transcript.sub_features = gene.sub_features 

127 gene.sub_features = [transcript] 

128 

129 logging.debug(f"Insert transcript for {gene.id} ({len(gene.sub_features)} exons)") 

130 

131 

132def move_cds_to_existing_mrna(gene: GFFSeqFeature) -> None: 

133 """Move CDS child features of a gene to the mRNA. 

134 

135 This is to fix the case where we have the following structure:: 

136 gene -> [ mRNA, CDSs ] 

137 

138 and change it to:: 

139 gene -> [ mRNA -> [ CDSs ] ] 

140 

141 The mRNA itself might have exons, in which case check that they match the CDS coordinates. 

142 

143 Args: 

144 gene: Gene feature to update. 

145 

146 Raises: 

147 GFFParserError: If the feature structure is not recognized. 

148 """ 

149 counts = _get_feat_counts(gene) 

150 if not counts.get("mRNA") or not counts.get("CDS"): 

151 return 

152 if counts["mRNA"] > 1: 

153 raise GFFParserError( 

154 f"Can't fix gene {gene.id}: contains several mRNAs and CDSs, all children of the gene" 

155 ) 

156 

157 # First, count the types 

158 mrnas = [] 

159 cdss = [] 

160 

161 gene_subf_clean = [] 

162 for subf in gene.sub_features: 

163 if subf.type == "mRNA": 

164 mrnas.append(subf) 

165 elif subf.type == "CDS": 

166 cdss.append(subf) 

167 else: 

168 gene_subf_clean.append(subf) 

169 

170 mrna = mrnas[0] 

171 

172 # Check if there are exons (or CDSs) under the mRNA 

173 sub_cdss = [] 

174 sub_exons = [] 

175 for subf in mrna.sub_features: 

176 if subf.type == "CDS": 

177 sub_cdss.append(subf) 

178 elif subf.type == "exon": 

179 sub_exons.append(subf) 

180 

181 # Check sub CDSs 

182 if sub_cdss: 

183 raise GFFParserError(f"Gene {gene.id} has CDSs as children in both the gene and mRNA") 

184 

185 # If there are exons, check they overlap with the CDSs 

186 _check_sub_exons(mrna, cdss, sub_exons) 

187 

188 # No more issues? Move the CDSs, and add any new exons 

189 mrna.sub_features += cdss 

190 # And remove them from the gene 

191 gene.sub_features = gene_subf_clean 

192 gene.sub_features.append(mrna) 

193 logging.debug(f"Gene {gene.id}: moved {len(cdss)} CDSs to the mRNA") 

194 

195 

196def _check_sub_exons(mrna: GFFSeqFeature, cdss: List[GFFSeqFeature], sub_exons: List[GFFSeqFeature]) -> None: 

197 """Check that the exons of the mRNA and the CDSs match. 

198 If there are no exons, create them from the CDSs. 

199 """ 

200 

201 new_sub_exons = [] 

202 if sub_exons: 

203 # Check that they match the CDS outside 

204 if len(sub_exons) == len(cdss): 

205 # Now that all coordinates are the same 

206 coord_exons = [f"{exon.location}" for exon in sub_exons] 

207 coord_cdss = [f"{cds.location}" for cds in cdss] 

208 

209 if coord_exons != coord_cdss: 

210 raise GFFParserError(f"Gene CDSs and exons under the mRNA {mrna.id} do not match") 

211 else: 

212 raise GFFParserError( 

213 f"Gene CDSs and exons under the mRNA {mrna.id} do not match (different count)" 

214 ) 

215 else: 

216 # No exons in the mRNA? Create them with the CDS coordinates 

217 for cur_cds in cdss: 

218 sub_exon = GFFSeqFeature(cur_cds.location, type="exon") 

219 new_sub_exons.append(sub_exon) 

220 mrna.sub_features += new_sub_exons 

221 

222 

223def remove_extra_exons(gene: GFFSeqFeature) -> None: 

224 """Remove duplicated exons existing in both the gene and the mRNAs. 

225 

226 This is a special case where a gene contains proper mRNAs, etc. but also extra exons for the same 

227 features. Those exons usually have an ID starting with "id-", so that is what we use to detect them. 

228 

229 Args: 

230 gene: Gene feature to update. 

231 

232 Raises: 

233 GFFParserError: If not all exons of this gene start with "id-". 

234 """ 

235 counts = _get_feat_counts(gene) 

236 if not counts.get("mRNA") and not counts.get("exon"): 

237 return 

238 

239 exons = [] 

240 mrnas = [] 

241 others = [] 

242 for subf in gene.sub_features: 

243 if subf.type == "exon": 

244 exons.append(subf) 

245 elif subf.type == "mRNA": 

246 mrnas.append(subf) 

247 else: 

248 others.append(subf) 

249 

250 if exons and mrnas: 

251 exon_has_id = 0 

252 # Check if the exon ids start with "id-", which is an indication that they do not belong here 

253 for exon in exons: 

254 if exon.id.startswith("id-"): 

255 exon_has_id += 1 

256 if exon_has_id == len(exons): 

257 logging.debug(f"Remove {exon_has_id} extra exons from {gene.id}") 

258 gene.sub_features = mrnas 

259 gene.sub_features += others 

260 else: 

261 raise GFFParserError(f"Can't remove extra exons for {gene.id}, not all start with 'id-'") 

262 

263 

264def remove_cds_from_pseudogene(gene: GFFSeqFeature) -> None: 

265 """Removes the CDSs from a pseudogene. 

266 

267 This assumes the CDSs are sub features of the transcript or the gene. 

268 

269 """ 

270 if gene.type != "pseudogene": 

271 return 

272 

273 gene_subfeats = [] 

274 for transcript in gene.sub_features: 

275 if transcript.type == "CDS": 

276 logging.debug(f"Remove pseudo CDS {transcript.id}") 

277 else: 

278 new_subfeats = [] 

279 for feat in transcript.sub_features: 

280 if feat.type == "CDS": 

281 logging.debug(f"Remove pseudo CDS {feat.id}") 

282 else: 

283 new_subfeats.append(feat) 

284 transcript.sub_features = new_subfeats 

285 gene_subfeats.append(transcript) 

286 gene.sub_features = gene_subfeats