Source code for pairk.backend.kmer_alignment.scoring_matrix_needleman

from Bio import Align, Seq
import copy
import pairk.backend.tools.sequence_utils as tools
import pairk.backend.tools.pairwise_tools as pairwise_tools
import pairk.backend.tools.matrices as matrices
import pairk.backend.kmer_alignment.needleman_tools as needleman_tools
import pairk.backend.exceptions as _exceptions
import pandas as pd


def run_pairwise_kmer_alignment_needleman(
    query_idr: str,
    ortholog_idrs: dict[str, str],
    k: int,
    aligner: Align.PairwiseAligner,
):
    kmers = tools.gen_kmers(query_idr, k)
    positions = list(range(len(kmers)))

    score_df = pairwise_tools.make_empty_kmer_ortho_df(
        positions, list(ortholog_idrs.keys())
    )
    orthokmer_df = pairwise_tools.make_empty_kmer_ortho_df(
        positions, list(ortholog_idrs.keys())
    )
    pos_df = pairwise_tools.make_empty_kmer_ortho_df(
        positions, list(ortholog_idrs.keys())
    )
    score_df.loc[positions, "query_kmer"] = kmers
    orthokmer_df.loc[positions, "query_kmer"] = kmers
    pos_df.loc[positions, "query_kmer"] = kmers
    for position, kmer in zip(positions, kmers):
        for ortholog_id, ortholog_idr in ortholog_idrs.items():
            if len(ortholog_idr) < k:
                orthokmer_df.loc[position, ortholog_id] = "-" * k
                continue
            best_score, best_subseq, best_pos = (
                needleman_tools.score_kmer_2_seq_needleman(kmer, ortholog_idr, aligner)
            )
            score_df.loc[position, ortholog_id] = best_score
            orthokmer_df.loc[position, ortholog_id] = best_subseq  # type: ignore
            pos_df.loc[position, ortholog_id] = best_pos
            # (
            #     reci_best_score,
            #     reci_best_subseq,
            #     _,
            # ) = needleman_tools.score_kmer_2_seq_needleman(
            #     best_subseq, query_idr, aligner  # type: ignore
            # )
    return score_df, orthokmer_df, pos_df


[docs] def pairk_alignment_needleman( idr_dict: dict[str, str], query_id: str, k: int, aligner: Align.PairwiseAligner | None = None, matrix_name: str = "EDSSMat50", ) -> pairwise_tools.PairkAln: """ run pairwise k-mer alignment method using the needleman-wunsch algorithm as implemented in Biopython. Each query k-mer is scored against each ortholog k-mer to find the best matching ortholog k-mer in each ortholog. If a ortholog IDR is shorter than the k-mer, a string of "-" characters ("-"\\*k) is assigned as the best matching ortholog k-mer for that ortholog **Note**: if there are multiple top-scoring matches, only one is returned. Parameters ---------- idr_dict : dict[str, str] input sequences in dictionary format with the key being the sequence id and the value being the sequence as a string query_id : str the id of the query sequence within the `idr_dict` dictionary k : int the length of the k-mers to use for the alignment aligner : Align.PairwiseAligner | None, optional The Biopython pairwise aligner object to use in the pairwise gapless alignments, by default None. If None, then an aligner object will be created using the scoring matrix specified in `matrix_name`. If an aligner object is provided, it will take precedence over the `matrix_name` parameter, i.e. the `matrix_name` parameter will be ignored. matrix_name : str, optional The name of the scoring matrix to use in the algorithm, by default "EDSSMat50". The available matrices can be viewed with the function `print_available_matrices()` in `pairk.backend.tools.matrices`. If an aligner object is provided, this parameter **will be ignored**. Returns ------- pairwise_tools.PairkAln an object containing the alignment results. See the `pairk.PairkAln` class for more information. """ _exceptions.check_queryid_in_idr_dict(idr_dict, query_id) idr_str_dict = copy.deepcopy(idr_dict) if aligner is None: _exceptions.validate_matrix_name(matrix_name) aligner = needleman_tools.make_aligner(matrix_name) query_idr = idr_str_dict.pop(query_id) score_df, orthokmer_df, pos_df = run_pairwise_kmer_alignment_needleman( query_idr, idr_str_dict, k, aligner=aligner, ) return pairwise_tools.PairkAln( orthokmer_df=orthokmer_df, pos_df=pos_df, score_df=score_df, )