Coverage for src/python/ensembl/io/genomio/fasta/process.py: 83%

1# See the NOTICE file distributed with this work for additional information

2# regarding copyright ownership.

4# Licensed under the Apache License, Version 2.0 (the "License");

5# you may not use this file except in compliance with the License.

6# You may obtain a copy of the License at

8# http://www.apache.org/licenses/LICENSE-2.0

10# Unless required by applicable law or agreed to in writing, software

11# distributed under the License is distributed on an "AS IS" BASIS,

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13# See the License for the specific language governing permissions and

14# limitations under the License.

15"""Takes a FASTA file (DNA or peptide), cleans it up and optionally excludes some IDs."""

17__all__ = ["FastaParserError", "get_peptides_to_exclude", "prep_fasta_data"]

19import logging

20from pathlib import Path

21from os import PathLike

22from typing import List, Optional, Set

24from Bio import SeqIO

26import ensembl.io.genomio

27from ensembl.utils.archive import open_gz_file

28from ensembl.utils.argparse import ArgumentParser

29from ensembl.utils.logging import init_logging_with_args

32exclude_seq_regions: List[str] = []

35class FastaParserError(Exception):

36 """Error while parsing a FASTA file."""

39def get_peptides_to_exclude(genbank_path: PathLike, seqr_to_exclude: Set[str]) -> Set[str]:

40 """

41 Extract peptide IDs from a genbank file that are in a given list of seq regions

42 """

43 peptides_to_exclude: Set[str] = set()

44 with open_gz_file(genbank_path) as in_genbank:

45 for record in SeqIO.parse(in_genbank, "genbank"):

46 if record.id in seqr_to_exclude:

47 logging.info(f"Skip sequence {record.id}")

48 for feat in record.features:

49 if feat.type == "CDS":

50 if "protein_id" in feat.qualifiers:

51 feat_id = feat.qualifiers["protein_id"]

52 peptides_to_exclude.add(feat_id[0])

53 else:

54 raise FastaParserError(f"Peptide without peptide ID ${feat}")

55 return peptides_to_exclude

58def prep_fasta_data(

59 fasta_infile: PathLike,

60 genbank_infile: Optional[PathLike],

61 fasta_outfile: PathLike,

62 peptide_mode: bool = False,

63) -> None:

64 """

65 Args:

66 fasta_file: Input FASTA file - DNA / Protein

67 genbank_infile: Input GenBank GBFF file (Optional)

68 fasta_outfile: Output FASTA sequence file.

69 peptide_mode: Process proteins instead of DNA

70 """

71 file_path = Path(fasta_infile)

73 to_exclude = set()

74 seqr_to_exclude = set(exclude_seq_regions)

75 if peptide_mode:

76 if genbank_infile is not None:

77 genbank_path = Path(genbank_infile)

78 to_exclude = get_peptides_to_exclude(genbank_path, seqr_to_exclude)

79 else:

80 to_exclude = seqr_to_exclude

82 # Copy and filter

83 records = []

85 # Final path

86 with open_gz_file(file_path) as in_fasta:

87 for record in SeqIO.parse(in_fasta, "fasta"):

88 if record.id in to_exclude: 88 ↛ 89line 88 didn't jump to line 89 because the condition on line 88 was never true

89 logging.info(f"Skip record ${record.id}")

90 else:

91 records.append(record)

92 with Path(fasta_outfile).open("w") as out_fasta:

93 SeqIO.write(records, out_fasta, "fasta")

96def main() -> None:

97 """Module's entry-point."""

98 parser = ArgumentParser(description="Clean-up a given FASTA file to remove unwanted elements.")

99 parser.add_argument_src_path("--fasta_infile", required=True, help="Input FASTA file - DNA / Protein")

100 parser.add_argument_src_path("--genbank_infile", help="Input GenBank GBFF file")

101 parser.add_argument_dst_path("--fasta_outfile", required=True, help="Output FASTA file")

102 parser.add_argument("--peptide_mode", action="store_true", help="Process proteins instead of DNA")

103 parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)

104 parser.add_log_arguments(add_log_file=True)

105 args = parser.parse_args()

106 init_logging_with_args(args)

107

108 prep_fasta_data(

109 fasta_infile=args.fasta_infile,

110 genbank_infile=args.genbank_infile,

111 fasta_outfile=args.fasta_outfile,

112 peptide_mode=args.peptide_mode,

113 )