Coverage for src/python/ensembl/io/genomio/seq_region/gbff.py: 100%
41 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""A `SeqRecord` wrapper."""
17__all__ = [
18 "GBFFRecord",
19]
21from dataclasses import dataclass
22import re
23from typing import Mapping
25from Bio.SeqRecord import SeqRecord
27from ensembl.io.genomio.seq_region.mappings import MOLECULE_LOCATION
28from ensembl.io.genomio.seq_region.exceptions import UnknownMetadata
@dataclass
class GBFFRecord:
    """Thin wrapper exposing selected metadata of a GenBank-format `SeqRecord`."""

    # The wrapped sequence record; every method below only reads from it.
    record: SeqRecord

    def get_genbank_id(self) -> str | None:
        """Returns the GenBank accession mentioned in the record comment (if present).

        Only useful for RefSeq sequence records, where the GenBank accession is stored in a comment
        of the form "The reference sequence was derived from <accession>.". The captured text stops
        at the first period following the accession.

        Returns:
            The captured accession, or `None` if the record has no comment or the sentence is absent.

        """
        raw_comment = str(self.record.annotations.get("comment", ""))
        if raw_comment:
            # The comment may be wrapped over several lines: collapse whitespace runs before searching
            flattened = re.sub(r"[ \n\r]+", " ", raw_comment)
            found = re.search(r"The reference sequence was derived from ([^\.]+)\.", flattened)
            if found:
                return found.group(1)
        return None

    def get_codon_table(self) -> int | None:
        """Returns the codon table number of the record (if present).

        The value is taken from the first feature that carries a `transl_table` qualifier.

        Returns:
            The codon table as an integer, or `None` if no feature has a `transl_table` qualifier.

        """
        candidates = (feat for feat in self.record.features if "transl_table" in feat.qualifiers)
        first_feat = next(candidates, None)
        if first_feat is None:
            return None
        return int(first_feat.qualifiers["transl_table"][0])

    def get_organelle(self, molecule_location: Mapping[str, str] = MOLECULE_LOCATION) -> str | None:
        """Returns the organelle location of the record (if present).

        Only the first feature with an `organelle` qualifier is considered.

        Args:
            molecule_location: Map of sequence type to SO location.

        Returns:
            The value mapped to the organelle name, or `None` if no feature has an `organelle` qualifier.

        Raises:
            UnknownMetadata: If the organelle name is not a key of `molecule_location`.

        """
        for feature in self.record.features:
            if "organelle" in feature.qualifiers:
                organelle = str(feature.qualifiers["organelle"][0])
                # Drop a leading "plastid:" or "mitochondrion:" and keep what follows the colon
                prefixed = re.match(r"^(plastid|mitochondrion):(.+)$", organelle)
                if prefixed is not None:
                    organelle = prefixed.group(2)
                # Translate to the controlled vocabulary
                try:
                    return molecule_location[organelle]
                except KeyError as exc:
                    raise UnknownMetadata(f"Unrecognized sequence location: {organelle}") from exc
        return None

    def is_circular(self) -> bool:
        """Returns True if the record `topology` annotation is "circular", False otherwise."""
        topology = self.record.annotations.get("topology", "")
        return topology == "circular"