Source code for pairk.utilities

from pairk.backend.tools.sequence_utils import aln_2_idr_position_map
import pairk.backend.tools.sequence_utils as _tools  # hiding this module from the user
from pathlib import Path


def fasta_MSA_to_idr_dict(
    alignment_file: str | Path, idr_aln_start: int, idr_aln_end: int
) -> dict[str, str]:
    """read a fasta-formatted multiple sequence alignment (MSA) and extract
    the IDR of every sequence into a dictionary.

    A single column window of the alignment defines the IDR for all sequences.

    Parameters
    ----------
    alignment_file : str | Path
        path to the fasta file holding the multiple sequence alignment
    idr_aln_start : int
        first alignment column of the IDR window
    idr_aln_end : int
        last alignment column of the IDR window (the column at this index is
        included in the window)

    Returns
    -------
    dict[str, str]
        sequence IDs mapped to the IDR region of each sequence, after the
        records have been passed through ``strip_dashes_from_sequences``
    """
    alignment = _tools.FastaImporter(alignment_file).import_as_alignment()
    # +1 because the python slice end is exclusive but idr_aln_end is meant
    # to be part of the window.
    idr_window = alignment[:, idr_aln_start : idr_aln_end + 1]
    degapped_records = _tools.strip_dashes_from_sequences(list(idr_window))  # type: ignore
    idr_sequences = {}
    for record in degapped_records:
        idr_sequences[record.id] = str(record.seq)
    return idr_sequences  # type: ignore
def fasta_MSA_to_idr_map(
    alignment_file: str | Path, idr_aln_start: int, idr_aln_end: int
) -> dict[str, list[int]]:
    """import a multiple sequence alignment (MSA) from a fasta file and return
    a dictionary of the start and end positions of the IDRs in the unaligned
    sequences. The IDRs are defined by a single slice of the alignment.

    Parameters
    ----------
    alignment_file : str | Path
        a path to a fasta file containing a multiple sequence alignment
    idr_aln_start : int
        start position of the IDRs in the alignment
    idr_aln_end : int
        end position of the IDRs in the alignment

    Returns
    -------
    dict[str, list[int]]
        idr_position_map: dictionary of the start and end positions of the
        IDRs in the unaligned sequences. The keys are the sequence ids and the
        values are lists of the start and end positions of the IDRs in the
        unaligned sequences. The start and end positions are 0-indexed.
    """
    faimporter = _tools.FastaImporter(alignment_file)
    aln = faimporter.import_as_alignment()
    # The helper receives idr_aln_end + 1; presumably it treats its end
    # argument as exclusive, which would make idr_aln_end itself inclusive
    # here -- TODO confirm against aln_2_idr_position_map.
    idr_map = aln_2_idr_position_map(aln, idr_aln_start, idr_aln_end + 1)
    return idr_map
def fasta_MSA_to_unaligned_sequences(alignment_file: str | Path) -> dict[str, str]:
    """read a fasta-formatted multiple sequence alignment (MSA) and return
    every sequence in unaligned form.

    Parameters
    ----------
    alignment_file : str | Path
        path to the fasta file holding the multiple sequence alignment

    Returns
    -------
    dict[str, str]
        sequence IDs mapped to the corresponding sequence after it has been
        passed through ``strip_dashes_from_str``
    """
    aligned_by_id = _tools.FastaImporter(alignment_file).import_as_str_dict()
    ungapped_by_id = {}
    for seq_id, aligned_seq in aligned_by_id.items():
        ungapped_by_id[seq_id] = _tools.strip_dashes_from_str(aligned_seq)
    return ungapped_by_id
# generate IDR map and IDR dict from