Coverage for src/python/ensembl/io/genomio/manifest/manifest.py: 97%

89 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Representation of a manifest file."""

# Public API of this module
__all__ = ["Manifest", "ManifestError"]

18 

19import hashlib 

20import json 

21import logging 

22from pathlib import Path 

23from typing import Any, TypeAlias 

24 

25 

26ManifestDict: TypeAlias = dict[str, dict[str, Any]] 

27 

28 

class ManifestError(Exception):
    """Could not load a manifest file.

    Raised when the manifest file does not exist, or when a file it lists does not match its
    recorded md5 checksum.
    """

31 

32 

33class Manifest: 

34 """Records of a manifest file and its files and md5 checksums.""" 

35 

36 _same_names = { 

37 "gff3", 

38 "fasta_dna", 

39 "fasta_pep", 

40 "functional_annotation", 

41 "genome", 

42 "seq_attrib", 

43 "seq_region", 

44 "agp", 

45 "events", 

46 } 

47 _alias_names = { 

48 "gene_models": "gff3", 

49 "dna": "fasta_dna", 

50 "pep": "fasta_pep", 

51 } 

52 _same_names_dict = {name: name for name in _same_names} 

53 names = {**_same_names_dict, **_alias_names} 

54 multi_files = {"agp"} 

55 

56 def __init__(self, manifest_dir: Path) -> None: 

57 """Initializes a manifest with the directory containing the files (and a manifest if it exists). 

58 

59 Args: 

60 manifest_dir: directory where the files are contained. 

61 """ 

62 self.root_dir = manifest_dir 

63 self.file_path = manifest_dir / "manifest.json" 

64 self.files: dict = {} 

65 

66 def create(self) -> None: 

67 """Creates a manifest file from the files in a directory.""" 

68 self.get_files_checksums() 

69 with self.file_path.open("w") as json_out: 

70 json_out.write(json.dumps(self.files, sort_keys=True, indent=4)) 

71 

72 def get_files_checksums(self) -> ManifestDict: 

73 """Records all the files in the directory with their checksum.""" 

74 manifest_files: ManifestDict = {} 

75 for subfile in self.root_dir.iterdir(): 

76 logging.debug(f"Check file {subfile} ({subfile.stem}, {subfile.suffix})") 

77 used_file = False 

78 if subfile.is_dir(): 

79 logging.warning("Can't create manifest for subdirectory") 

80 continue 

81 

82 # Delete and skip empty files 

83 if subfile.stat().st_size == 0: 

84 logging.warning(f"Skip and delete empty file: {subfile}") 

85 subfile.unlink() 

86 continue 

87 

88 for name, standard_name in self.names.items(): 

89 # Either the last element of the stem or the suffix is a known name 

90 if subfile.stem.endswith(name) or subfile.suffix == f".{name}": 

91 logging.debug(f"Matched to {name} ({standard_name}) = {subfile}") 

92 used_file = True 

93 md5 = self._get_md5sum(subfile) 

94 file_obj = {"file": subfile.name, "md5sum": md5} 

95 

96 # Multiple files stored, each with a name 

97 if standard_name in self.multi_files: 

98 manifest_files.setdefault(standard_name, {}) 

99 obj_name = self._prepare_object_name(subfile, name, manifest_files[standard_name]) 

100 manifest_files[standard_name][obj_name] = file_obj 

101 

102 # Single file/init 

103 else: 

104 manifest_files[standard_name] = file_obj 

105 

106 if not used_file: 

107 logging.warning(f"File {subfile} was not included in the manifest") 

108 

109 self.files = manifest_files 

110 return self.files 

111 

112 def _prepare_object_name( 

113 self, subfile: Path, name: str, manifest_file_dict: dict[str, dict[str, str]] 

114 ) -> str: 

115 # Prepare object name 

116 try: 

117 # If we recognize the suffix, then the name is the part after the last "_" 

118 if subfile.suffix == f".{name}": 118 ↛ 122line 118 didn't jump to line 122 because the condition on line 118 was always true

119 obj_name = subfile.stem.split(sep="_")[-1] 

120 # If we recognize the end of the name, then the name is the part before the last "_" 

121 else: 

122 obj_name = subfile.stem.split(sep="_")[-2] 

123 except IndexError: 

124 obj_name = "file" 

125 

126 # Add number if duplicate name 

127 obj_name_base = obj_name 

128 count = 1 

129 while obj_name in manifest_file_dict.keys(): 

130 obj_name = f"{obj_name_base}.{count}" 

131 count += 1 

132 if count >= 10: 

133 raise ValueError(f"Too many files with same name {obj_name_base}") 

134 return obj_name 

135 

136 def load(self) -> ManifestDict: 

137 """Load the content of an existing manifest file.""" 

138 if not self.file_path.exists(): 

139 raise ManifestError(f"Cannot load non-existing manifest file: {self.file_path}") 

140 

141 with self.file_path.open("r") as manifest_fh: 

142 manifest = json.load(manifest_fh) 

143 

144 # Use dir name from the manifest 

145 for name in manifest: 

146 if "file" in manifest[name]: 

147 file_path = self.root_dir / manifest[name]["file"] 

148 # check if the md5sum is correct 

149 md5sum = manifest[name]["md5sum"] 

150 self._check_md5sum(file_path, md5sum) 

151 else: 

152 for f in manifest[name]: 

153 file_path = self.root_dir / manifest[name][f]["file"] 

154 # check if the md5sum is correct 

155 md5sum = manifest[name][f]["md5sum"] 

156 self._check_md5sum(file_path, md5sum) 

157 

158 self.files = manifest 

159 return self.files 

160 

161 @staticmethod 

162 def _get_md5sum(file_path: Path) -> str: 

163 """Returns the md5 checksum for a given file.""" 

164 with file_path.open("rb") as f: 

165 data_bytes = f.read() 

166 return hashlib.md5(data_bytes).hexdigest() 

167 

168 def _check_md5sum(self, file_path: Path, md5sum: str) -> None: 

169 """Checks a file against an md5 checksum, raises a ManifestError if the checksum fails. 

170 

171 Args: 

172 file_path: Path to a genome file. 

173 md5sum: MD5 hash for the files. 

174 """ 

175 file_md5sum = self._get_md5sum(file_path) 

176 if file_md5sum != md5sum: 

177 raise ManifestError(f"Invalid md5 checksum for {file_path}: got {file_md5sum}, expected {md5sum}")