Coverage for src/python/ensembl/io/genomio/genome_metadata/extend.py: 91%
84 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Update a genome metadata file to include additional sequence regions (e.g. MT chromosome)."""
17__all__ = [
18 "get_additions",
19 "get_gbff_regions",
20 "get_report_regions_names",
21 "amend_genome_metadata",
22]
24import csv
25from os import PathLike
26from pathlib import Path
27import re
28from typing import Dict, List, Tuple, Optional
30from Bio import SeqIO
32import ensembl.io.genomio
33from ensembl.io.genomio.utils import get_json, print_json
34from ensembl.utils.archive import open_gz_file
35from ensembl.utils.argparse import ArgumentParser
36from ensembl.utils.logging import init_logging_with_args
39_VERSION_END = re.compile(r"\.\d+$")
42def get_additions(report_path: PathLike, gbff_path: Optional[PathLike]) -> List[str]:
43 """Returns all `seq_regions` that are mentioned in the report but that are not in the data.
45 Args:
46 report_path: Path to the report file.
47 gbff_path: Path to the GBFF file.
48 """
49 gbff_regions = set(get_gbff_regions(gbff_path))
50 report_regions = get_report_regions_names(report_path)
51 additions = []
52 for seq_region_name in report_regions:
53 (genbank_seq_name, refseq_seq_name) = seq_region_name
54 if genbank_seq_name not in gbff_regions and refseq_seq_name not in gbff_regions:
55 if refseq_seq_name:
56 additions.append(refseq_seq_name)
57 else:
58 additions.append(genbank_seq_name)
59 additions = sorted(additions)
60 return additions
63def get_gbff_regions(gbff_path: Optional[PathLike]) -> List[str]:
64 """Returns the `seq_region` data from a GBFF file.
66 Args:
67 gbff_path: GBFF file path to use.
68 """
69 seq_regions = []
70 if gbff_path:
71 with open_gz_file(gbff_path) as gbff_file:
72 for record in SeqIO.parse(gbff_file, "genbank"):
73 record_id = re.sub(_VERSION_END, "", record.id)
74 seq_regions.append(record_id)
75 return seq_regions
78def _report_to_csv(report_path: PathLike) -> Tuple[str, Dict]:
79 """Returns the assembly report as a CSV string, and its metadata as a dictionary.
81 Args:
82 report_path: Path to the assembly report file from INSDC/RefSeq.
83 """
84 data = ""
85 metadata = {}
86 with Path(report_path).open("r") as report:
87 prev_line = ""
88 for line in report:
89 if line.startswith("#"):
90 # Get metadata values if possible
91 match = re.search(r"^#\s*([^:]+?):\s+(.+?)\s*$", line)
92 if match:
93 metadata[match.group(1)] = match.group(2)
94 prev_line = line
95 else:
96 if prev_line:
97 # Add previous line as header of CSV string, removing the initial "# "
98 data += prev_line[2:].strip() + "\n"
99 prev_line = ""
100 data += line
101 return data, metadata
104def get_report_regions_names(report_path: PathLike) -> List[Tuple[str, str]]:
105 """Returns a list of GenBank-RefSeq `seq_region` names from the assembly report file.
107 Args:
108 report_path: Path to the assembly report file from INSDC/RefSeq.
109 """
110 # Get the report in a CSV format, easier to manipulate
111 report_csv, _ = _report_to_csv(report_path)
112 # Feed the CSV string to the CSV reader
113 reader = csv.DictReader(report_csv.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE)
114 # Create the seq_regions
115 seq_regions = []
116 for row in reader:
117 refseq_name = row["RefSeq-Accn"]
118 genbank_name = row["GenBank-Accn"]
119 if refseq_name == "na":
120 refseq_name = ""
121 if genbank_name == "na":
122 genbank_name = ""
123 refseq_name = re.sub(_VERSION_END, "", refseq_name)
124 genbank_name = re.sub(_VERSION_END, "", genbank_name)
125 seq_regions.append((genbank_name, refseq_name))
126 return seq_regions
129def amend_genome_metadata(
130 genome_infile: PathLike,
131 genome_outfile: PathLike,
132 report_file: Optional[PathLike] = None,
133 genbank_file: Optional[PathLike] = None,
134) -> None:
135 """
136 Args:
137 genome_infile: Genome metadata following the `src/python/ensembl/io/genomio/data/schemas/genome.json`.
138 genome_outfile: Amended genome metadata file.
139 report_file: INSDC/RefSeq sequences report file.
140 genbank_file: INSDC/RefSeq GBFF file.
141 """
142 genome_metadata = get_json(genome_infile)
143 # Get additional sequences in the assembly but not in the data
144 if report_file:
145 genbank_path = Path(genbank_file) if genbank_file else None
146 additions = get_additions(report_file, genbank_path)
147 if additions:
148 genome_metadata["added_seq"] = {"region_name": additions}
149 # Print out the file
150 genome_outfile = Path(genome_outfile)
151 print_json(genome_outfile, genome_metadata)
154def main() -> None:
155 """Module's entry-point."""
156 parser = ArgumentParser(description=__doc__)
157 parser.add_argument_src_path(
158 "--genome_infile",
159 required=True,
160 help="Input genome metadata file (following src/python/ensembl/io/genomio/data/schemas/genome.json)",
161 )
162 parser.add_argument_dst_path(
163 "--genome_outfile", required=True, help="Path to the new amended genome metadata file"
164 )
165 parser.add_argument_src_path("--report_file", help="INSDC/RefSeq sequences report file")
166 parser.add_argument_src_path("--genbank_file", help="INSDC/RefSeq GBFF file")
167 parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
168 parser.add_log_arguments()
169 args = parser.parse_args()
170 init_logging_with_args(args)
172 amend_genome_metadata(
173 genome_infile=args.genome_infile,
174 genome_outfile=args.genome_outfile,
175 report_file=args.report_file,
176 genbank_file=args.genbank_file,
177 )