Coverage for src/python/ensembl/io/genomio/seq_region/report.py: 100%

29 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Object for an INSDC assembly report to expose its data and metadata easily.""" 

16 

17__all__ = [ 

18 "ReportRecord", 

19] 

20 

21import csv 

22from os import PathLike 

23from pathlib import Path 

24import re 

25from typing import Tuple 

26 

27from ensembl.utils.archive import open_gz_file 

28 

29 

class ReportRecord:
    """Represent an assembly report file.

    Exposes 2 things:
    - ``metadata``: a dict built from the ``# key: value`` comment lines of the report.
    - ``reader``: a ``csv.DictReader`` that yields all the seq_region lines of the report as dicts.
    """

    def __init__(self, report_path: Path) -> None:
        """Load the report and prepare its metadata and seq_region reader.

        Args:
            report_path: Path to a seq_region file from INSDC/RefSeq.

        Raises:
            ValueError: If the report has no comment line to use as the column header.
        """
        report_csv, metadata = self.report_to_csv(report_path)
        self.metadata = metadata
        # The report is tab-separated and unquoted, so quotes are kept as literal characters.
        self.reader = csv.DictReader(report_csv.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE)

    @staticmethod
    def report_to_csv(report_path: PathLike) -> Tuple[str, dict]:
        """Returns an assembly report as a CSV string.

        Lines starting with ``#`` are comments: those shaped like ``# key: value`` are stored in the
        metadata, and the last comment line found in the file is used as the column header of the
        returned data. All other lines are kept verbatim as data rows.

        Args:
            report_path: Path to a seq_region file from INSDC/RefSeq.

        Returns:
            The data as a string in CSV format, and the head metadata as a dictionary.

        Raises:
            ValueError: If the report does not contain any comment line (no header available).

        """
        # Compile once instead of looking the pattern up for every comment line
        metadata_pattern = re.compile(r"# (.+?): (.+?)$")
        # Collect rows in a list and join once at the end to avoid quadratic string concatenation
        data_lines = []
        metadata = {}
        header_line = ""
        with open_gz_file(report_path) as report:
            for line in report:
                if line.startswith("#"):
                    # Get metadata values if possible
                    match = metadata_pattern.search(line)
                    if match:
                        metadata[match.group(1)] = match.group(2)
                    # Keep overwriting so that the last comment line ends up being the header
                    header_line = line
                else:
                    data_lines.append(line)

        if not header_line:
            raise ValueError("Missing header in report")
        # Drop the first two characters of the header (the leading "# ") and surrounding whitespace
        data = header_line[2:].strip() + "\n" + "".join(data_lines)

        return data, metadata