Coverage for src/python/ensembl/io/genomio/gff3/restructure.py: 99%
135 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Restructure a gene model to a standard representation: `gene -> [ mRNAs -> [CDSs, exons] ]`"""
17__all__ = [
18 "restructure_gene",
19 "add_transcript_to_naked_gene",
20 "move_only_cdss_to_new_mrna",
21 "move_only_exons_to_new_mrna",
22 "move_cds_to_existing_mrna",
23 "remove_extra_exons",
24 "remove_cds_from_pseudogene",
25]
27from collections import Counter
28import logging
29from typing import List
31from .exceptions import GFFParserError
32from .features import GFFSeqFeature
35def _get_feat_counts(gene: GFFSeqFeature) -> Counter:
36 return Counter([feat.type for feat in gene.sub_features])
39def restructure_gene(gene: GFFSeqFeature) -> None:
40 """Standardize the structure of a gene model:
41 - Add a transcript if there are no children
42 - Move the CDS and exons to an mRNA if they are directly under the gene
44 Args:
45 gene: Gene feature to restructure.
47 Raises:
48 GFFParserError: If there are CDSs/exons remaining under the gene after applying the fixes.
49 """
50 # Skip if the children of the gene look ok
51 counts = _get_feat_counts(gene)
52 if (len(counts) > 0) and not counts.get("CDS") and not counts.get("exon"):
53 return
55 # Make sure the gene has a transcript if nothing else
56 add_transcript_to_naked_gene(gene)
58 # Corrections if there are CDSs or exons directly under the gene level
59 move_only_cdss_to_new_mrna(gene)
60 move_only_exons_to_new_mrna(gene)
61 move_cds_to_existing_mrna(gene)
62 remove_extra_exons(gene)
64 # Check again after fixes that no CDS or exon remain under the gene
65 counts = _get_feat_counts(gene)
66 if counts.get("CDS") or counts.get("exon"): 66 ↛ 67line 66 didn't jump to line 67 because the condition on line 66 was never true
67 raise GFFParserError(f"Gene {gene.id} contains direct CDSs and exons children")
70def add_transcript_to_naked_gene(gene: GFFSeqFeature) -> None:
71 """Add an unspecific transcript to a gene without any sub features."""
73 if (len(gene.sub_features) > 0) or (gene.type != "gene"):
74 return
76 transcript = GFFSeqFeature(gene.location, type="transcript")
77 transcript.qualifiers["source"] = gene.qualifiers["source"]
78 gene.sub_features = [transcript]
79 logging.debug(f"Inserted 1 transcript for a lone gene {gene.id}")
82def move_only_cdss_to_new_mrna(gene: GFFSeqFeature) -> None:
83 """Add intermediate mRNAs to a gene with only CDS children.
84 Do nothing if some sub-features are not CDS.
85 """
87 counts = _get_feat_counts(gene)
88 if (len(counts) != 1) or not counts.get("CDS"):
89 return
91 transcripts_dict = {}
93 for cds in gene.sub_features:
94 # We create as many transcripts as there are different CDS IDs
95 if cds.id not in transcripts_dict:
96 logging.debug(f"Create a new mRNA for {cds.id}")
97 transcript = GFFSeqFeature(gene.location, type="mRNA")
98 transcript.qualifiers["source"] = gene.qualifiers["source"]
99 transcripts_dict[cds.id] = transcript
101 # Add the CDS to the transcript
102 transcripts_dict[cds.id].sub_features.append(cds)
104 # Also add an exon in the same location
105 exon = GFFSeqFeature(cds.location, type="exon")
106 exon.qualifiers["source"] = gene.qualifiers["source"]
107 transcripts_dict[cds.id].sub_features.append(exon)
109 transcripts = list(transcripts_dict.values())
110 gene.sub_features = transcripts
112 logging.debug(f"Insert transcript-exon feats for {gene.id} ({len(transcripts)} CDSs)")
115def move_only_exons_to_new_mrna(gene: GFFSeqFeature) -> None:
116 """Add an mRNA for a gene that only has exons and move the exons under the mRNA.
117 No change if the gene has other sub_features than exon.
118 """
120 counts = _get_feat_counts(gene)
121 if (len(counts) != 1) or not counts.get("exon"):
122 return
124 transcript = GFFSeqFeature(gene.location, type="mRNA")
125 transcript.qualifiers["source"] = gene.qualifiers["source"]
126 transcript.sub_features = gene.sub_features
127 gene.sub_features = [transcript]
129 logging.debug(f"Insert transcript for {gene.id} ({len(gene.sub_features)} exons)")
132def move_cds_to_existing_mrna(gene: GFFSeqFeature) -> None:
133 """Move CDS child features of a gene to the mRNA.
135 This is to fix the case where we have the following structure::
136 gene -> [ mRNA, CDSs ]
138 and change it to::
139 gene -> [ mRNA -> [ CDSs ] ]
141 The mRNA itself might have exons, in which case check that they match the CDS coordinates.
143 Args:
144 gene: Gene feature to update.
146 Raises:
147 GFFParserError: If the feature structure is not recognized.
148 """
149 counts = _get_feat_counts(gene)
150 if not counts.get("mRNA") or not counts.get("CDS"):
151 return
152 if counts["mRNA"] > 1:
153 raise GFFParserError(
154 f"Can't fix gene {gene.id}: contains several mRNAs and CDSs, all children of the gene"
155 )
157 # First, count the types
158 mrnas = []
159 cdss = []
161 gene_subf_clean = []
162 for subf in gene.sub_features:
163 if subf.type == "mRNA":
164 mrnas.append(subf)
165 elif subf.type == "CDS":
166 cdss.append(subf)
167 else:
168 gene_subf_clean.append(subf)
170 mrna = mrnas[0]
172 # Check if there are exons (or CDSs) under the mRNA
173 sub_cdss = []
174 sub_exons = []
175 for subf in mrna.sub_features:
176 if subf.type == "CDS":
177 sub_cdss.append(subf)
178 elif subf.type == "exon":
179 sub_exons.append(subf)
181 # Check sub CDSs
182 if sub_cdss:
183 raise GFFParserError(f"Gene {gene.id} has CDSs as children in both the gene and mRNA")
185 # If there are exons, check they overlap with the CDSs
186 _check_sub_exons(mrna, cdss, sub_exons)
188 # No more issues? Move the CDSs, and add any new exons
189 mrna.sub_features += cdss
190 # And remove them from the gene
191 gene.sub_features = gene_subf_clean
192 gene.sub_features.append(mrna)
193 logging.debug(f"Gene {gene.id}: moved {len(cdss)} CDSs to the mRNA")
196def _check_sub_exons(mrna: GFFSeqFeature, cdss: List[GFFSeqFeature], sub_exons: List[GFFSeqFeature]) -> None:
197 """Check that the exons of the mRNA and the CDSs match.
198 If there are no exons, create them from the CDSs.
199 """
201 new_sub_exons = []
202 if sub_exons:
203 # Check that they match the CDS outside
204 if len(sub_exons) == len(cdss):
205 # Now that all coordinates are the same
206 coord_exons = [f"{exon.location}" for exon in sub_exons]
207 coord_cdss = [f"{cds.location}" for cds in cdss]
209 if coord_exons != coord_cdss:
210 raise GFFParserError(f"Gene CDSs and exons under the mRNA {mrna.id} do not match")
211 else:
212 raise GFFParserError(
213 f"Gene CDSs and exons under the mRNA {mrna.id} do not match (different count)"
214 )
215 else:
216 # No exons in the mRNA? Create them with the CDS coordinates
217 for cur_cds in cdss:
218 sub_exon = GFFSeqFeature(cur_cds.location, type="exon")
219 new_sub_exons.append(sub_exon)
220 mrna.sub_features += new_sub_exons
223def remove_extra_exons(gene: GFFSeqFeature) -> None:
224 """Remove duplicated exons existing in both the gene and the mRNAs.
226 This is a special case where a gene contains proper mRNAs, etc. but also extra exons for the same
227 features. Those exons usually have an ID starting with "id-", so that is what we use to detect them.
229 Args:
230 gene: Gene feature to update.
232 Raises:
233 GFFParserError: If not all exons of this gene start with "id-".
234 """
235 counts = _get_feat_counts(gene)
236 if not counts.get("mRNA") and not counts.get("exon"):
237 return
239 exons = []
240 mrnas = []
241 others = []
242 for subf in gene.sub_features:
243 if subf.type == "exon":
244 exons.append(subf)
245 elif subf.type == "mRNA":
246 mrnas.append(subf)
247 else:
248 others.append(subf)
250 if exons and mrnas:
251 exon_has_id = 0
252 # Check if the exon ids start with "id-", which is an indication that they do not belong here
253 for exon in exons:
254 if exon.id.startswith("id-"):
255 exon_has_id += 1
256 if exon_has_id == len(exons):
257 logging.debug(f"Remove {exon_has_id} extra exons from {gene.id}")
258 gene.sub_features = mrnas
259 gene.sub_features += others
260 else:
261 raise GFFParserError(f"Can't remove extra exons for {gene.id}, not all start with 'id-'")
264def remove_cds_from_pseudogene(gene: GFFSeqFeature) -> None:
265 """Removes the CDSs from a pseudogene.
267 This assumes the CDSs are sub features of the transcript or the gene.
269 """
270 if gene.type != "pseudogene":
271 return
273 gene_subfeats = []
274 for transcript in gene.sub_features:
275 if transcript.type == "CDS":
276 logging.debug(f"Remove pseudo CDS {transcript.id}")
277 else:
278 new_subfeats = []
279 for feat in transcript.sub_features:
280 if feat.type == "CDS":
281 logging.debug(f"Remove pseudo CDS {feat.id}")
282 else:
283 new_subfeats.append(feat)
284 transcript.sub_features = new_subfeats
285 gene_subfeats.append(transcript)
286 gene.sub_features = gene_subfeats