Coverage for src/python/ensembl/io/genomio/gff3/id_allocator.py: 94%

102 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Check and allocate IDs for gene features in a GFF3 file.""" 

16 

17__all__ = ["StableIDAllocator", "InvalidStableID"] 

18 

19from dataclasses import dataclass, field 

20import logging 

21import re 

22from typing import Dict, List, Optional, Set 

23 

24from .features import GFFSeqFeature 

25 

26 

27class InvalidStableID(ValueError): 

28 """Raised when there is a problem with an stable ID.""" 

29 

30 

31@dataclass 

32class StableIDAllocator: 

33 """Set of tools to check and allocate stable IDs.""" 

34 

35 # Multiple parameters to automate various fixes 

36 skip_gene_id_validation: bool = False 

37 min_id_length: int = 7 

38 current_id_number: int = 0 

39 make_missing_stable_ids: bool = True 

40 prefix: str = "TMP_" 

41 _loaded_ids: Set = field(default_factory=set) 

42 

43 def set_prefix(self, genome: Dict) -> None: 

44 """Sets the ID prefix using the organism abbrev if it exists in the genome metadata.""" 

45 try: 

46 org = genome["BRC4"]["organism_abbrev"] 

47 except KeyError: 

48 prefix = "TMP_PREFIX_" 

49 else: 

50 prefix = "TMP_" + org + "_" 

51 self.prefix = prefix 

52 

53 def generate_gene_id(self) -> str: 

54 """Returns a new unique gene stable_id with a prefix. 

55 

56 The ID is made up of a prefix and a number, which is auto incremented. 

57 

58 """ 

59 self.current_id_number += 1 

60 new_id = f"{self.prefix}{self.current_id_number}" 

61 return new_id 

62 

63 def is_valid(self, stable_id: str) -> bool: 

64 """Checks that the format of a stable ID is valid. 

65 Args: 

66 stable_id: Stable ID to validate. 

67 """ 

68 

69 if self.skip_gene_id_validation: 

70 logging.debug(f"Validation deactivated by user: '{stable_id}' not checked") 

71 return True 

72 

73 # Trna (from tRNAscan) 

74 if re.search(r"^Trna", stable_id, re.IGNORECASE): 

75 logging.debug(f"Stable ID is a tRNA from tRNA-scan: {stable_id}") 

76 return False 

77 

78 # Coordinates 

79 if re.search(r"^.+:\d+..\d+", stable_id): 

80 logging.debug(f"Stable id is a coordinate: {stable_id}") 

81 return False 

82 

83 # Special characters 

84 if re.search(r"[ |]", stable_id): 

85 logging.debug(f"Stable id contains special characters: {stable_id}") 

86 return False 

87 

88 # Min length 

89 if len(stable_id) < self.min_id_length: 

90 logging.debug(f"Stable id is too short (<{self.min_id_length}) {stable_id}") 

91 return False 

92 

93 return True 

94 

95 @staticmethod 

96 def remove_prefix(stable_id: str, prefixes: List[str]) -> str: 

97 """Returns the stable ID after removing its prefix (if any). 

98 

99 If more than one prefix may be found, only the first one is removed. 

100 

101 Args: 

102 stable_id: Stable ID to process. 

103 prefixes: List of prefixes to search for. 

104 """ 

105 

106 for prefix in prefixes: 

107 if stable_id.startswith(prefix): 

108 return stable_id[len(prefix) :] 

109 return stable_id 

110 

111 @staticmethod 

112 def generate_transcript_id(gene_id: str, number: int) -> str: 

113 """Returns a formatted transcript ID generated from a gene ID and number. 

114 Args: 

115 gene_id: Gene stable ID. 

116 number: Positive number. 

117 Raises: 

118 ValueError: If the number provided is not greater than zero. 

119 

120 """ 

121 if number < 1: 121 ↛ 122line 121 didn't jump to line 122 because the condition on line 121 was never true

122 raise ValueError("Number has to be a positive integer.") 

123 

124 transcript_id = f"{gene_id}_t{number}" 

125 return transcript_id 

126 

127 def normalize_cds_id(self, cds_id: str) -> str: 

128 """Returns a normalized version of the provided CDS ID. 

129 

130 The normalisation implies to remove any unnecessary prefixes around the CDS ID. However, if 

131 the CDS ID is still not proper, an empty string will be returned. 

132 

133 Args: 

134 cds_id: CDS ID to normalize. 

135 

136 """ 

137 

138 prefixes = ["cds-", "cds:"] 

139 normalized_cds_id = StableIDAllocator.remove_prefix(cds_id, prefixes) 

140 

141 # Special case: if the ID doesn't look like one, remove it - it needs to be regenerated 

142 if not self.is_valid(normalized_cds_id): 

143 return "" 

144 return normalized_cds_id 

145 

146 def normalize_pseudogene_cds_id(self, pseudogene: GFFSeqFeature) -> None: 

147 """Normalizes every CDS ID of the provided pseudogene. 

148 

149 Ensure each CDS from a pseudogene has a proper ID: 

150 - Different from the gene 

151 - Derived from the gene if it is not proper 

152 

153 Args: 

154 pseudogene: Pseudogene feature. 

155 """ 

156 for transcript in pseudogene.sub_features: 

157 for feat in transcript.sub_features: 

158 if feat.type == "CDS": 

159 feat.id = self.normalize_cds_id(feat.id) 

160 if feat.id in ("", pseudogene.id): 

161 feat.id = f"{transcript.id}_cds" 

162 feat.qualifiers["ID"] = feat.id 

163 

164 def normalize_gene_id(self, gene: GFFSeqFeature, refseq: Optional[bool] = False) -> str: 

165 """Returns a normalized gene stable ID. 

166 

167 Removes any unnecessary prefixes, but will generate a new stable ID if the normalized one is 

168 not recognized as valid. 

169 

170 Args: 

171 gene: Gene feature to normalize. 

172 """ 

173 prefixes = ["gene-", "gene:"] 

174 new_gene_id = StableIDAllocator.remove_prefix(gene.id, prefixes) 

175 

176 is_valid = False 

177 # Special case for RefSeq: only valid Gene IDs are LOC* 

178 if refseq: 178 ↛ 179line 178 didn't jump to line 179 because the condition on line 178 was never true

179 if new_gene_id.startswith("LOC"): 

180 is_valid = True 

181 else: 

182 is_valid = self.is_valid(new_gene_id) 

183 

184 if is_valid: 

185 return new_gene_id 

186 

187 # In case the normalized gene ID is not valid, use the GeneID 

188 logging.debug(f"Gene ID is not valid: {new_gene_id}") 

189 qual = gene.qualifiers 

190 if "Dbxref" in qual: 

191 for xref in qual["Dbxref"]: 

192 (db, value) = xref.split(":") 

193 if db != "GeneID": 

194 continue 

195 new_gene_id_base = f"{db}_{value}" 

196 new_gene_id = new_gene_id_base 

197 number = 1 

198 while new_gene_id in self._loaded_ids: 

199 number += 1 

200 new_gene_id = f"{new_gene_id_base}_{number}" 

201 if number > 10: 201 ↛ 202line 201 didn't jump to line 202 because the condition on line 201 was never true

202 raise InvalidStableID(f"Duplicate ID {new_gene_id_base} (up to {new_gene_id})") 

203 self._loaded_ids.add(new_gene_id) 

204 logging.debug(f"Using GeneID {new_gene_id} for stable_id instead of {gene.id}") 

205 return new_gene_id 

206 

207 # Make a new stable_id 

208 if self.make_missing_stable_ids: 

209 new_gene_id = self.generate_gene_id() 

210 logging.debug(f"New ID: {new_gene_id} -> {new_gene_id}") 

211 return new_gene_id 

212 raise InvalidStableID(f"Can't use invalid gene id for {gene}")