Coverage for src/python/ensembl/io/genomio/seq_region/report.py: 100%
29 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Object for an INSDC assembly report to expose its data and metadata easily."""
17__all__ = [
18 "ReportRecord",
19]
21import csv
22from os import PathLike
23from pathlib import Path
24import re
25from typing import Tuple
27from ensembl.utils.archive import open_gz_file
class ReportRecord:
    """Represent an assembly report file. Exposes 2 things:

    - ``metadata``: a dict built from the ``# key: value`` comment lines of the report.
    - ``reader``: a ``csv.DictReader`` that yields all the seq_region lines of the report as dicts.
    """

    def __init__(self, report_path: PathLike) -> None:
        """Loads the report and prepares its metadata and its seq_region reader.

        Args:
            report_path: Path to a seq_region file from INSDC/RefSeq.

        Raises:
            ValueError: If the report does not contain any comment line to use as header.
        """
        report_csv, metadata = self.report_to_csv(report_path)
        self.metadata = metadata
        # Columns are tab-separated and quote characters are kept as plain data (QUOTE_NONE)
        self.reader = csv.DictReader(report_csv.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE)

    @staticmethod
    def report_to_csv(report_path: PathLike) -> Tuple[str, dict]:
        """Returns an assembly report as a CSV string.

        Lines starting with ``#`` are comments: each one matching ``# key: value`` is stored in the
        metadata, and the last comment line found is used as the header (column names) of the data.
        All other lines are kept unchanged as data rows.

        Args:
            report_path: Path to a seq_region file from INSDC/RefSeq.

        Returns:
            The data as a string in CSV format (header line first), and the head metadata as a
            dictionary.

        Raises:
            ValueError: If the report does not contain any comment line to use as header.
        """
        # Compiled once, outside the loop; raw string so the pattern is read verbatim
        metadata_pattern = re.compile(r"# (.+?): (.+?)$")
        # Data rows are collected in a list and joined once, instead of growing a string per line
        data_lines = []
        metadata = {}
        header_line = ""
        with open_gz_file(report_path) as report:
            for line in report:
                if line.startswith("#"):
                    # Get metadata values if possible
                    match = metadata_pattern.search(line)
                    if match:
                        metadata[match.group(1)] = match.group(2)
                    # Keep overwriting: the last comment line is the one used as header
                    header_line = line
                else:
                    data_lines.append(line)

        if not header_line:
            raise ValueError("Missing header in report")
        # Drop the comment marker and the surrounding whitespace; unlike a fixed [2:] slice this
        # does not eat the first character of the first column name when there is no space
        # after the "#"
        header = header_line.lstrip("#").strip()
        return header + "\n" + "".join(data_lines), metadata