Coverage for src/python/ensembl/io/genomio/genome_metadata/extend.py: 91%

84 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Update a genome metadata file to include additional sequence regions (e.g. MT chromosome).""" 

16 

17__all__ = [ 

18 "get_additions", 

19 "get_gbff_regions", 

20 "get_report_regions_names", 

21 "amend_genome_metadata", 

22] 

23 

24import csv 

25from os import PathLike 

26from pathlib import Path 

27import re 

28from typing import Dict, List, Tuple, Optional 

29 

30from Bio import SeqIO 

31 

32import ensembl.io.genomio 

33from ensembl.io.genomio.utils import get_json, print_json 

34from ensembl.utils.archive import open_gz_file 

35from ensembl.utils.argparse import ArgumentParser 

36from ensembl.utils.logging import init_logging_with_args 

37 

38 

39_VERSION_END = re.compile(r"\.\d+$") 

40 

41 

42def get_additions(report_path: PathLike, gbff_path: Optional[PathLike]) -> List[str]: 

43 """Returns all `seq_regions` that are mentioned in the report but that are not in the data. 

44 

45 Args: 

46 report_path: Path to the report file. 

47 gbff_path: Path to the GBFF file. 

48 """ 

49 gbff_regions = set(get_gbff_regions(gbff_path)) 

50 report_regions = get_report_regions_names(report_path) 

51 additions = [] 

52 for seq_region_name in report_regions: 

53 (genbank_seq_name, refseq_seq_name) = seq_region_name 

54 if genbank_seq_name not in gbff_regions and refseq_seq_name not in gbff_regions: 

55 if refseq_seq_name: 

56 additions.append(refseq_seq_name) 

57 else: 

58 additions.append(genbank_seq_name) 

59 additions = sorted(additions) 

60 return additions 

61 

62 

63def get_gbff_regions(gbff_path: Optional[PathLike]) -> List[str]: 

64 """Returns the `seq_region` data from a GBFF file. 

65 

66 Args: 

67 gbff_path: GBFF file path to use. 

68 """ 

69 seq_regions = [] 

70 if gbff_path: 

71 with open_gz_file(gbff_path) as gbff_file: 

72 for record in SeqIO.parse(gbff_file, "genbank"): 

73 record_id = re.sub(_VERSION_END, "", record.id) 

74 seq_regions.append(record_id) 

75 return seq_regions 

76 

77 

78def _report_to_csv(report_path: PathLike) -> Tuple[str, Dict]: 

79 """Returns the assembly report as a CSV string, and its metadata as a dictionary. 

80 

81 Args: 

82 report_path: Path to the assembly report file from INSDC/RefSeq. 

83 """ 

84 data = "" 

85 metadata = {} 

86 with Path(report_path).open("r") as report: 

87 prev_line = "" 

88 for line in report: 

89 if line.startswith("#"): 

90 # Get metadata values if possible 

91 match = re.search(r"^#\s*([^:]+?):\s+(.+?)\s*$", line) 

92 if match: 

93 metadata[match.group(1)] = match.group(2) 

94 prev_line = line 

95 else: 

96 if prev_line: 

97 # Add previous line as header of CSV string, removing the initial "# " 

98 data += prev_line[2:].strip() + "\n" 

99 prev_line = "" 

100 data += line 

101 return data, metadata 

102 

103 

104def get_report_regions_names(report_path: PathLike) -> List[Tuple[str, str]]: 

105 """Returns a list of GenBank-RefSeq `seq_region` names from the assembly report file. 

106 

107 Args: 

108 report_path: Path to the assembly report file from INSDC/RefSeq. 

109 """ 

110 # Get the report in a CSV format, easier to manipulate 

111 report_csv, _ = _report_to_csv(report_path) 

112 # Feed the CSV string to the CSV reader 

113 reader = csv.DictReader(report_csv.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE) 

114 # Create the seq_regions 

115 seq_regions = [] 

116 for row in reader: 

117 refseq_name = row["RefSeq-Accn"] 

118 genbank_name = row["GenBank-Accn"] 

119 if refseq_name == "na": 

120 refseq_name = "" 

121 if genbank_name == "na": 

122 genbank_name = "" 

123 refseq_name = re.sub(_VERSION_END, "", refseq_name) 

124 genbank_name = re.sub(_VERSION_END, "", genbank_name) 

125 seq_regions.append((genbank_name, refseq_name)) 

126 return seq_regions 

127 

128 

129def amend_genome_metadata( 

130 genome_infile: PathLike, 

131 genome_outfile: PathLike, 

132 report_file: Optional[PathLike] = None, 

133 genbank_file: Optional[PathLike] = None, 

134) -> None: 

135 """ 

136 Args: 

137 genome_infile: Genome metadata following the `src/python/ensembl/io/genomio/data/schemas/genome.json`. 

138 genome_outfile: Amended genome metadata file. 

139 report_file: INSDC/RefSeq sequences report file. 

140 genbank_file: INSDC/RefSeq GBFF file. 

141 """ 

142 genome_metadata = get_json(genome_infile) 

143 # Get additional sequences in the assembly but not in the data 

144 if report_file: 

145 genbank_path = Path(genbank_file) if genbank_file else None 

146 additions = get_additions(report_file, genbank_path) 

147 if additions: 

148 genome_metadata["added_seq"] = {"region_name": additions} 

149 # Print out the file 

150 genome_outfile = Path(genome_outfile) 

151 print_json(genome_outfile, genome_metadata) 

152 

153 

154def main() -> None: 

155 """Module's entry-point.""" 

156 parser = ArgumentParser(description=__doc__) 

157 parser.add_argument_src_path( 

158 "--genome_infile", 

159 required=True, 

160 help="Input genome metadata file (following src/python/ensembl/io/genomio/data/schemas/genome.json)", 

161 ) 

162 parser.add_argument_dst_path( 

163 "--genome_outfile", required=True, help="Path to the new amended genome metadata file" 

164 ) 

165 parser.add_argument_src_path("--report_file", help="INSDC/RefSeq sequences report file") 

166 parser.add_argument_src_path("--genbank_file", help="INSDC/RefSeq GBFF file") 

167 parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__) 

168 parser.add_log_arguments() 

169 args = parser.parse_args() 

170 init_logging_with_args(args) 

171 

172 amend_genome_metadata( 

173 genome_infile=args.genome_infile, 

174 genome_outfile=args.genome_outfile, 

175 report_file=args.report_file, 

176 genbank_file=args.genbank_file, 

177 )