Coverage for src/python/ensembl/io/genomio/genbank/download.py: 76%

32 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Download a Genbank file from NCBI from an accession.""" 

16 

17__all__ = ["DownloadError", "download_genbank"] 

18 

19import logging 

20from os import PathLike 

21from pathlib import Path 

22 

23import requests 

24 

25import ensembl.io.genomio 

26from ensembl.utils.argparse import ArgumentParser 

27from ensembl.utils.logging import init_logging_with_args 

28 

29 

class DownloadError(Exception):
    """Raised when a download fails.

    Attributes:
        msg: Description of the failure, as passed to the constructor.

    """

    def __init__(self, msg: str) -> None:
        """Create the error with its description.

        Args:
            msg: Description of what went wrong.

        """
        # Chain to the base class explicitly so `args` (and therefore `str()`) is set through
        # the regular constructor path instead of relying on `BaseException.__new__`.
        super().__init__(msg)
        self.msg = msg

35 

36 

def download_genbank(accession: str, output_file: PathLike) -> None:
    """Fetch a GenBank record from NCBI and save it to a local file.

    The record is requested from the NCBI Entrez `efetch` endpoint (`nuccore` database,
    `gbwithparts` return type, text mode) and the response body is written unchanged to
    `output_file`.

    Args:
        accession: INSDC Genbank record accession.
        output_file: Path where the downloaded record is written, in Genbank format.

    Raises:
        DownloadError: If the response is falsy or its status code is not 200. Exceptions
            raised by `requests.get` itself are not caught here.

    """
    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    # Query parameters for the record to fetch; "id" is kept last so the logged dict is unchanged
    query = {
        "db": "nuccore",
        "rettype": "gbwithparts",
        "retmode": "text",
        "id": accession,
    }
    logging.debug(f"Getting file from {efetch_url} with params {query}")
    response = requests.get(efetch_url, params=query, timeout=60)

    # Anything other than a truthy response with status 200 counts as a failed download
    if not response or response.status_code != 200:
        raise DownloadError(f"Could not download the genbank ({accession}) file: {response}")

    Path(output_file).write_bytes(response.content)
    logging.info(f"GenBank file written to {output_file}")

67 

68 

def main() -> None:
    """Parse the command-line arguments and download the requested GenBank record."""
    cli = ArgumentParser(description="Download a sequence from GenBank.")
    cli.add_argument("--accession", required=True, help="Sequence accession")
    cli.add_argument_dst_path("--output_file", required=True, help="Output GenBank file")
    cli.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    cli.add_log_arguments()

    options = cli.parse_args()
    # Logging is configured from the parsed arguments before the download starts
    init_logging_with_args(options)

    download_genbank(accession=options.accession, output_file=options.output_file)

80 

81 

# Run the command-line entry point only when this module is executed as a script
if __name__ == "__main__":
    main()