Coverage for src/python/ensembl/io/genomio/fasta/process.py: 83%
53 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-21 15:37 +0000
1# See the NOTICE file distributed with this work for additional information
2# regarding copyright ownership.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Takes a FASTA file (DNA or peptide), cleans it up and optionally excludes some IDs."""
17__all__ = ["FastaParserError", "get_peptides_to_exclude", "prep_fasta_data"]
19import logging
20from pathlib import Path
21from os import PathLike
22from typing import List, Optional, Set
24from Bio import SeqIO
26import ensembl.io.genomio
27from ensembl.utils.archive import open_gz_file
28from ensembl.utils.argparse import ArgumentParser
29from ensembl.utils.logging import init_logging_with_args
32exclude_seq_regions: List[str] = []
35class FastaParserError(Exception):
36 """Error while parsing a FASTA file."""
39def get_peptides_to_exclude(genbank_path: PathLike, seqr_to_exclude: Set[str]) -> Set[str]:
40 """
41 Extract peptide IDs from a genbank file that are in a given list of seq regions
42 """
43 peptides_to_exclude: Set[str] = set()
44 with open_gz_file(genbank_path) as in_genbank:
45 for record in SeqIO.parse(in_genbank, "genbank"):
46 if record.id in seqr_to_exclude:
47 logging.info(f"Skip sequence {record.id}")
48 for feat in record.features:
49 if feat.type == "CDS":
50 if "protein_id" in feat.qualifiers:
51 feat_id = feat.qualifiers["protein_id"]
52 peptides_to_exclude.add(feat_id[0])
53 else:
54 raise FastaParserError(f"Peptide without peptide ID ${feat}")
55 return peptides_to_exclude
58def prep_fasta_data(
59 fasta_infile: PathLike,
60 genbank_infile: Optional[PathLike],
61 fasta_outfile: PathLike,
62 peptide_mode: bool = False,
63) -> None:
64 """
65 Args:
66 fasta_file: Input FASTA file - DNA / Protein
67 genbank_infile: Input GenBank GBFF file (Optional)
68 fasta_outfile: Output FASTA sequence file.
69 peptide_mode: Process proteins instead of DNA
70 """
71 file_path = Path(fasta_infile)
73 to_exclude = set()
74 seqr_to_exclude = set(exclude_seq_regions)
75 if peptide_mode:
76 if genbank_infile is not None:
77 genbank_path = Path(genbank_infile)
78 to_exclude = get_peptides_to_exclude(genbank_path, seqr_to_exclude)
79 else:
80 to_exclude = seqr_to_exclude
82 # Copy and filter
83 records = []
85 # Final path
86 with open_gz_file(file_path) as in_fasta:
87 for record in SeqIO.parse(in_fasta, "fasta"):
88 if record.id in to_exclude: 88 ↛ 89line 88 didn't jump to line 89 because the condition on line 88 was never true
89 logging.info(f"Skip record ${record.id}")
90 else:
91 records.append(record)
92 with Path(fasta_outfile).open("w") as out_fasta:
93 SeqIO.write(records, out_fasta, "fasta")
96def main() -> None:
97 """Module's entry-point."""
98 parser = ArgumentParser(description="Clean-up a given FASTA file to remove unwanted elements.")
99 parser.add_argument_src_path("--fasta_infile", required=True, help="Input FASTA file - DNA / Protein")
100 parser.add_argument_src_path("--genbank_infile", help="Input GenBank GBFF file")
101 parser.add_argument_dst_path("--fasta_outfile", required=True, help="Output FASTA file")
102 parser.add_argument("--peptide_mode", action="store_true", help="Process proteins instead of DNA")
103 parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
104 parser.add_log_arguments(add_log_file=True)
105 args = parser.parse_args()
106 init_logging_with_args(args)
108 prep_fasta_data(
109 fasta_infile=args.fasta_infile,
110 genbank_infile=args.genbank_infile,
111 fasta_outfile=args.fasta_outfile,
112 peptide_mode=args.peptide_mode,
113 )