Coverage for src/python/ensembl/io/genomio/seq_region/gbff.py: 100%

41 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""A `SeqRecord` wrapper.""" 

16 

17__all__ = [ 

18 "GBFFRecord", 

19] 

20 

21from dataclasses import dataclass 

22import re 

23from typing import Mapping 

24 

25from Bio.SeqRecord import SeqRecord 

26 

27from ensembl.io.genomio.seq_region.mappings import MOLECULE_LOCATION 

28from ensembl.io.genomio.seq_region.exceptions import UnknownMetadata 

29 

30 

@dataclass
class GBFFRecord:
    """Thin wrapper exposing selected metadata of a GenBank `SeqRecord`.

    Attributes:
        record: The wrapped sequence record; its `annotations` mapping and `features` list are read
            by the methods of this class.

    """

    record: SeqRecord

    def get_genbank_id(self) -> str | None:
        """Returns the GenBank accession mentioned in the record's comment, if any.

        The comment annotation is searched for the sentence "The reference sequence was derived
        from <accession>." (as found in RefSeq records). The captured accession runs up to, but
        does not include, the first period that follows it.

        Returns:
            The captured accession, or `None` if the record has no comment or the comment does not
            contain the expected sentence.

        """
        raw_comment = str(self.record.annotations.get("comment", ""))
        if not raw_comment:
            return None
        # Collapse runs of spaces and line breaks so a sentence wrapped over several lines still matches
        flat_comment = re.sub(r"[ \n\r]+", " ", raw_comment)
        if (found := re.search(r"The reference sequence was derived from ([^\.]+)\.", flat_comment)) is None:
            return None
        return found[1]

    def get_codon_table(self) -> int | None:
        """Returns the codon table number of the record, if any.

        Returns:
            The first value of the `transl_table` qualifier of the first feature that carries one,
            converted to an integer; `None` if no feature has that qualifier.

        """
        for feature in self.record.features:
            qualifiers = feature.qualifiers
            if "transl_table" not in qualifiers:
                continue
            return int(qualifiers["transl_table"][0])
        return None

    def get_organelle(self, molecule_location: Mapping[str, str] = MOLECULE_LOCATION) -> str | None:
        """Returns the controlled organelle location of the record, if any.

        Only the first feature carrying an `organelle` qualifier is considered.

        Args:
            molecule_location: Map of sequence type to SO location.

        Returns:
            The value mapped to the organelle name in `molecule_location`, or `None` if no feature
            has an `organelle` qualifier.

        Raises:
            UnknownMetadata: If the organelle name is not a key of `molecule_location`.

        """
        for feature in self.record.features:
            if "organelle" in feature.qualifiers:
                organelle = str(feature.qualifiers["organelle"][0])
                # Keep only what follows a leading "plastid:" or "mitochondrion:" prefix
                prefixed = re.match(r"^(plastid|mitochondrion):(.+)$", organelle)
                if prefixed is not None:
                    organelle = prefixed.group(2)
                # Translate to the controlled name
                try:
                    return molecule_location[organelle]
                except KeyError as exc:
                    raise UnknownMetadata(f"Unrecognized sequence location: {organelle}") from exc
        return None

    def is_circular(self) -> bool:
        """Returns True if the record's `topology` annotation is "circular", False otherwise."""
        topology = self.record.annotations.get("topology", "")
        return topology == "circular"