Coverage for src/python/ensembl/io/genomio/fasta/process.py: 83%

53 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-21 15:37 +0000

1# See the NOTICE file distributed with this work for additional information 

2# regarding copyright ownership. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); 

5# you may not use this file except in compliance with the License. 

6# You may obtain a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15"""Takes a FASTA file (DNA or peptide), cleans it up and optionally excludes some IDs.""" 

16 

17__all__ = ["FastaParserError", "get_peptides_to_exclude", "prep_fasta_data"] 

18 

19import logging 

20from pathlib import Path 

21from os import PathLike 

22from typing import List, Optional, Set 

23 

24from Bio import SeqIO 

25 

26import ensembl.io.genomio 

27from ensembl.utils.archive import open_gz_file 

28from ensembl.utils.argparse import ArgumentParser 

29from ensembl.utils.logging import init_logging_with_args 

30 

31 

32exclude_seq_regions: List[str] = [] 

33 

34 

35class FastaParserError(Exception): 

36 """Error while parsing a FASTA file.""" 

37 

38 

39def get_peptides_to_exclude(genbank_path: PathLike, seqr_to_exclude: Set[str]) -> Set[str]: 

40 """ 

41 Extract peptide IDs from a genbank file that are in a given list of seq regions 

42 """ 

43 peptides_to_exclude: Set[str] = set() 

44 with open_gz_file(genbank_path) as in_genbank: 

45 for record in SeqIO.parse(in_genbank, "genbank"): 

46 if record.id in seqr_to_exclude: 

47 logging.info(f"Skip sequence {record.id}") 

48 for feat in record.features: 

49 if feat.type == "CDS": 

50 if "protein_id" in feat.qualifiers: 

51 feat_id = feat.qualifiers["protein_id"] 

52 peptides_to_exclude.add(feat_id[0]) 

53 else: 

54 raise FastaParserError(f"Peptide without peptide ID ${feat}") 

55 return peptides_to_exclude 

56 

57 

58def prep_fasta_data( 

59 fasta_infile: PathLike, 

60 genbank_infile: Optional[PathLike], 

61 fasta_outfile: PathLike, 

62 peptide_mode: bool = False, 

63) -> None: 

64 """ 

65 Args: 

66 fasta_file: Input FASTA file - DNA / Protein 

67 genbank_infile: Input GenBank GBFF file (Optional) 

68 fasta_outfile: Output FASTA sequence file. 

69 peptide_mode: Process proteins instead of DNA 

70 """ 

71 file_path = Path(fasta_infile) 

72 

73 to_exclude = set() 

74 seqr_to_exclude = set(exclude_seq_regions) 

75 if peptide_mode: 

76 if genbank_infile is not None: 

77 genbank_path = Path(genbank_infile) 

78 to_exclude = get_peptides_to_exclude(genbank_path, seqr_to_exclude) 

79 else: 

80 to_exclude = seqr_to_exclude 

81 

82 # Copy and filter 

83 records = [] 

84 

85 # Final path 

86 with open_gz_file(file_path) as in_fasta: 

87 for record in SeqIO.parse(in_fasta, "fasta"): 

88 if record.id in to_exclude: 88 ↛ 89line 88 didn't jump to line 89 because the condition on line 88 was never true

89 logging.info(f"Skip record ${record.id}") 

90 else: 

91 records.append(record) 

92 with Path(fasta_outfile).open("w") as out_fasta: 

93 SeqIO.write(records, out_fasta, "fasta") 

94 

95 

96def main() -> None: 

97 """Module's entry-point.""" 

98 parser = ArgumentParser(description="Clean-up a given FASTA file to remove unwanted elements.") 

99 parser.add_argument_src_path("--fasta_infile", required=True, help="Input FASTA file - DNA / Protein") 

100 parser.add_argument_src_path("--genbank_infile", help="Input GenBank GBFF file") 

101 parser.add_argument_dst_path("--fasta_outfile", required=True, help="Output FASTA file") 

102 parser.add_argument("--peptide_mode", action="store_true", help="Process proteins instead of DNA") 

103 parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__) 

104 parser.add_log_arguments(add_log_file=True) 

105 args = parser.parse_args() 

106 init_logging_with_args(args) 

107 

108 prep_fasta_data( 

109 fasta_infile=args.fasta_infile, 

110 genbank_infile=args.genbank_infile, 

111 fasta_outfile=args.fasta_outfile, 

112 peptide_mode=args.peptide_mode, 

113 )