Coverage for src/python/ensembl/io/genomio/manifest/manifest.py: 97%
89 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Representation of a manifest file."""
# Public API of this module.
__all__ = [
    "Manifest",
    "ManifestError",
]
19import hashlib
20import json
21import logging
22from pathlib import Path
23from typing import Any, TypeAlias
26ManifestDict: TypeAlias = dict[str, dict[str, Any]]
class ManifestError(Exception):
    """Raised when a manifest file does not exist or a listed file fails its md5 check."""
33class Manifest:
34 """Records of a manifest file and its files and md5 checksums."""
36 _same_names = {
37 "gff3",
38 "fasta_dna",
39 "fasta_pep",
40 "functional_annotation",
41 "genome",
42 "seq_attrib",
43 "seq_region",
44 "agp",
45 "events",
46 }
47 _alias_names = {
48 "gene_models": "gff3",
49 "dna": "fasta_dna",
50 "pep": "fasta_pep",
51 }
52 _same_names_dict = {name: name for name in _same_names}
53 names = {**_same_names_dict, **_alias_names}
54 multi_files = {"agp"}
56 def __init__(self, manifest_dir: Path) -> None:
57 """Initializes a manifest with the directory containing the files (and a manifest if it exists).
59 Args:
60 manifest_dir: directory where the files are contained.
61 """
62 self.root_dir = manifest_dir
63 self.file_path = manifest_dir / "manifest.json"
64 self.files: dict = {}
66 def create(self) -> None:
67 """Creates a manifest file from the files in a directory."""
68 self.get_files_checksums()
69 with self.file_path.open("w") as json_out:
70 json_out.write(json.dumps(self.files, sort_keys=True, indent=4))
72 def get_files_checksums(self) -> ManifestDict:
73 """Records all the files in the directory with their checksum."""
74 manifest_files: ManifestDict = {}
75 for subfile in self.root_dir.iterdir():
76 logging.debug(f"Check file {subfile} ({subfile.stem}, {subfile.suffix})")
77 used_file = False
78 if subfile.is_dir():
79 logging.warning("Can't create manifest for subdirectory")
80 continue
82 # Delete and skip empty files
83 if subfile.stat().st_size == 0:
84 logging.warning(f"Skip and delete empty file: {subfile}")
85 subfile.unlink()
86 continue
88 for name, standard_name in self.names.items():
89 # Either the last element of the stem or the suffix is a known name
90 if subfile.stem.endswith(name) or subfile.suffix == f".{name}":
91 logging.debug(f"Matched to {name} ({standard_name}) = {subfile}")
92 used_file = True
93 md5 = self._get_md5sum(subfile)
94 file_obj = {"file": subfile.name, "md5sum": md5}
96 # Multiple files stored, each with a name
97 if standard_name in self.multi_files:
98 manifest_files.setdefault(standard_name, {})
99 obj_name = self._prepare_object_name(subfile, name, manifest_files[standard_name])
100 manifest_files[standard_name][obj_name] = file_obj
102 # Single file/init
103 else:
104 manifest_files[standard_name] = file_obj
106 if not used_file:
107 logging.warning(f"File {subfile} was not included in the manifest")
109 self.files = manifest_files
110 return self.files
112 def _prepare_object_name(
113 self, subfile: Path, name: str, manifest_file_dict: dict[str, dict[str, str]]
114 ) -> str:
115 # Prepare object name
116 try:
117 # If we recognize the suffix, then the name is the part after the last "_"
118 if subfile.suffix == f".{name}": 118 ↛ 122line 118 didn't jump to line 122 because the condition on line 118 was always true
119 obj_name = subfile.stem.split(sep="_")[-1]
120 # If we recognize the end of the name, then the name is the part before the last "_"
121 else:
122 obj_name = subfile.stem.split(sep="_")[-2]
123 except IndexError:
124 obj_name = "file"
126 # Add number if duplicate name
127 obj_name_base = obj_name
128 count = 1
129 while obj_name in manifest_file_dict.keys():
130 obj_name = f"{obj_name_base}.{count}"
131 count += 1
132 if count >= 10:
133 raise ValueError(f"Too many files with same name {obj_name_base}")
134 return obj_name
136 def load(self) -> ManifestDict:
137 """Load the content of an existing manifest file."""
138 if not self.file_path.exists():
139 raise ManifestError(f"Cannot load non-existing manifest file: {self.file_path}")
141 with self.file_path.open("r") as manifest_fh:
142 manifest = json.load(manifest_fh)
144 # Use dir name from the manifest
145 for name in manifest:
146 if "file" in manifest[name]:
147 file_path = self.root_dir / manifest[name]["file"]
148 # check if the md5sum is correct
149 md5sum = manifest[name]["md5sum"]
150 self._check_md5sum(file_path, md5sum)
151 else:
152 for f in manifest[name]:
153 file_path = self.root_dir / manifest[name][f]["file"]
154 # check if the md5sum is correct
155 md5sum = manifest[name][f]["md5sum"]
156 self._check_md5sum(file_path, md5sum)
158 self.files = manifest
159 return self.files
161 @staticmethod
162 def _get_md5sum(file_path: Path) -> str:
163 """Returns the md5 checksum for a given file."""
164 with file_path.open("rb") as f:
165 data_bytes = f.read()
166 return hashlib.md5(data_bytes).hexdigest()
168 def _check_md5sum(self, file_path: Path, md5sum: str) -> None:
169 """Checks a file against an md5 checksum, raises a ManifestError if the checksum fails.
171 Args:
172 file_path: Path to a genome file.
173 md5sum: MD5 hash for the files.
174 """
175 file_md5sum = self._get_md5sum(file_path)
176 if file_md5sum != md5sum:
177 raise ManifestError(f"Invalid md5 checksum for {file_path}: got {file_md5sum}, expected {md5sum}")