Coverage for src/python/ensembl/io/genomio/gff3/id_allocator.py: 94%
102 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Check and allocate IDs for gene features in a GFF3 file."""
17__all__ = ["StableIDAllocator", "InvalidStableID"]
19from dataclasses import dataclass, field
20import logging
21import re
22from typing import Dict, List, Optional, Set
24from .features import GFFSeqFeature
27class InvalidStableID(ValueError):
28 """Raised when there is a problem with an stable ID."""
31@dataclass
32class StableIDAllocator:
33 """Set of tools to check and allocate stable IDs."""
35 # Multiple parameters to automate various fixes
36 skip_gene_id_validation: bool = False
37 min_id_length: int = 7
38 current_id_number: int = 0
39 make_missing_stable_ids: bool = True
40 prefix: str = "TMP_"
41 _loaded_ids: Set = field(default_factory=set)
43 def set_prefix(self, genome: Dict) -> None:
44 """Sets the ID prefix using the organism abbrev if it exists in the genome metadata."""
45 try:
46 org = genome["BRC4"]["organism_abbrev"]
47 except KeyError:
48 prefix = "TMP_PREFIX_"
49 else:
50 prefix = "TMP_" + org + "_"
51 self.prefix = prefix
53 def generate_gene_id(self) -> str:
54 """Returns a new unique gene stable_id with a prefix.
56 The ID is made up of a prefix and a number, which is auto incremented.
58 """
59 self.current_id_number += 1
60 new_id = f"{self.prefix}{self.current_id_number}"
61 return new_id
63 def is_valid(self, stable_id: str) -> bool:
64 """Checks that the format of a stable ID is valid.
65 Args:
66 stable_id: Stable ID to validate.
67 """
69 if self.skip_gene_id_validation:
70 logging.debug(f"Validation deactivated by user: '{stable_id}' not checked")
71 return True
73 # Trna (from tRNAscan)
74 if re.search(r"^Trna", stable_id, re.IGNORECASE):
75 logging.debug(f"Stable ID is a tRNA from tRNA-scan: {stable_id}")
76 return False
78 # Coordinates
79 if re.search(r"^.+:\d+..\d+", stable_id):
80 logging.debug(f"Stable id is a coordinate: {stable_id}")
81 return False
83 # Special characters
84 if re.search(r"[ |]", stable_id):
85 logging.debug(f"Stable id contains special characters: {stable_id}")
86 return False
88 # Min length
89 if len(stable_id) < self.min_id_length:
90 logging.debug(f"Stable id is too short (<{self.min_id_length}) {stable_id}")
91 return False
93 return True
95 @staticmethod
96 def remove_prefix(stable_id: str, prefixes: List[str]) -> str:
97 """Returns the stable ID after removing its prefix (if any).
99 If more than one prefix may be found, only the first one is removed.
101 Args:
102 stable_id: Stable ID to process.
103 prefixes: List of prefixes to search for.
104 """
106 for prefix in prefixes:
107 if stable_id.startswith(prefix):
108 return stable_id[len(prefix) :]
109 return stable_id
111 @staticmethod
112 def generate_transcript_id(gene_id: str, number: int) -> str:
113 """Returns a formatted transcript ID generated from a gene ID and number.
114 Args:
115 gene_id: Gene stable ID.
116 number: Positive number.
117 Raises:
118 ValueError: If the number provided is not greater than zero.
120 """
121 if number < 1: 121 ↛ 122line 121 didn't jump to line 122 because the condition on line 121 was never true
122 raise ValueError("Number has to be a positive integer.")
124 transcript_id = f"{gene_id}_t{number}"
125 return transcript_id
127 def normalize_cds_id(self, cds_id: str) -> str:
128 """Returns a normalized version of the provided CDS ID.
130 The normalisation implies to remove any unnecessary prefixes around the CDS ID. However, if
131 the CDS ID is still not proper, an empty string will be returned.
133 Args:
134 cds_id: CDS ID to normalize.
136 """
138 prefixes = ["cds-", "cds:"]
139 normalized_cds_id = StableIDAllocator.remove_prefix(cds_id, prefixes)
141 # Special case: if the ID doesn't look like one, remove it - it needs to be regenerated
142 if not self.is_valid(normalized_cds_id):
143 return ""
144 return normalized_cds_id
146 def normalize_pseudogene_cds_id(self, pseudogene: GFFSeqFeature) -> None:
147 """Normalizes every CDS ID of the provided pseudogene.
149 Ensure each CDS from a pseudogene has a proper ID:
150 - Different from the gene
151 - Derived from the gene if it is not proper
153 Args:
154 pseudogene: Pseudogene feature.
155 """
156 for transcript in pseudogene.sub_features:
157 for feat in transcript.sub_features:
158 if feat.type == "CDS":
159 feat.id = self.normalize_cds_id(feat.id)
160 if feat.id in ("", pseudogene.id):
161 feat.id = f"{transcript.id}_cds"
162 feat.qualifiers["ID"] = feat.id
164 def normalize_gene_id(self, gene: GFFSeqFeature, refseq: Optional[bool] = False) -> str:
165 """Returns a normalized gene stable ID.
167 Removes any unnecessary prefixes, but will generate a new stable ID if the normalized one is
168 not recognized as valid.
170 Args:
171 gene: Gene feature to normalize.
172 """
173 prefixes = ["gene-", "gene:"]
174 new_gene_id = StableIDAllocator.remove_prefix(gene.id, prefixes)
176 is_valid = False
177 # Special case for RefSeq: only valid Gene IDs are LOC*
178 if refseq: 178 ↛ 179line 178 didn't jump to line 179 because the condition on line 178 was never true
179 if new_gene_id.startswith("LOC"):
180 is_valid = True
181 else:
182 is_valid = self.is_valid(new_gene_id)
184 if is_valid:
185 return new_gene_id
187 # In case the normalized gene ID is not valid, use the GeneID
188 logging.debug(f"Gene ID is not valid: {new_gene_id}")
189 qual = gene.qualifiers
190 if "Dbxref" in qual:
191 for xref in qual["Dbxref"]:
192 (db, value) = xref.split(":")
193 if db != "GeneID":
194 continue
195 new_gene_id_base = f"{db}_{value}"
196 new_gene_id = new_gene_id_base
197 number = 1
198 while new_gene_id in self._loaded_ids:
199 number += 1
200 new_gene_id = f"{new_gene_id_base}_{number}"
201 if number > 10: 201 ↛ 202line 201 didn't jump to line 202 because the condition on line 201 was never true
202 raise InvalidStableID(f"Duplicate ID {new_gene_id_base} (up to {new_gene_id})")
203 self._loaded_ids.add(new_gene_id)
204 logging.debug(f"Using GeneID {new_gene_id} for stable_id instead of {gene.id}")
205 return new_gene_id
207 # Make a new stable_id
208 if self.make_missing_stable_ids:
209 new_gene_id = self.generate_gene_id()
210 logging.debug(f"New ID: {new_gene_id} -> {new_gene_id}")
211 return new_gene_id
212 raise InvalidStableID(f"Can't use invalid gene id for {gene}")