Source code for pairk.backend.tools.esm_tools

import torch



[docs]
class ESM_Model:
    """
    This was adapted from the kibby conservation method: DOI: 10.1093/bib/bbac599.
    see https://github.com/esbgkannan/kibby

    Class that loads a specified ESM model. Provides a method for encoding protein sequences.

    available models:
    - esm1b_t33_650M_UR50S\n
    - esm2_t6_8M_UR50D\n
    - esm2_t12_35M_UR50D\n
    - esm2_t30_150M_UR50D\n
    - esm2_t33_650M_UR50D (default)\n
    - esm2_t36_3B_UR50D\n


    Attributes
    ----------
    model_name : str
        the name of the model that was loaded.
    threads : int
        the number of threads for pytorch to use, by default 1.

    """

    def __init__(self, model_name: str = "esm2_t33_650M_UR50D", threads: int = 1):
        torch.set_num_threads(threads)
        self._load(model_name)

    def _load(self, model_name):
        import esm

        self.model_name = model_name
        self.model, alphabet = esm.pretrained.load_model_and_alphabet(model_name)
        # self.model, alphabet = eval(f"esm.pretrained.{self.model_name}()")
        self.batch_converter = alphabet.get_batch_converter()
        self.model.eval()
        self.embed_dim = self.model._modules["layers"][0].embed_dim
        self.layers = sum(1 for i in self.model._modules["layers"])


[docs]
    def encode(self, sequence, device="cuda"):
        """encode a protein sequence using the loaded model.

        Parameters
        ----------
        sequence : str
            the amino acid sequence to encode.
        device : str, optional
            whether to use a GPU via "cuda", or "cpu", by default "cuda"

        Returns
        -------
        torch.Tensor
            sequence embedding tensor
        """

        try:
            torch.cuda.empty_cache()

            batch_labels, batch_strs, batch_tokens = self.batch_converter(
                [("", sequence)]
            )
            batch_tokens = batch_tokens.to(device)
            self.model = self.model.to(device)
            with torch.no_grad():
                results = self.model(
                    batch_tokens, repr_layers=[self.layers], return_contacts=False
                )
                results = results["representations"][self.layers].to("cpu")[0]
            return results
        except Exception as e:
            if device != "cpu":
                print(f"failed with {device}, trying cpu...")
                return self.encode(sequence, device="cpu")
            else:
                raise e



    # def encode_multiple_seqs(self, sequences: list[str], device="cuda"):
    #     """encode a protein sequence using the loaded model.

    #     Parameters
    #     ----------
    #     sequences : list[str]
    #         list of the amino acid sequences to encode.
    #     device : str, optional
    #         whether to use a GPU via "cuda", or "cpu", by default "cuda"

    #     Returns
    #     -------
    #     torch.Tensor
    #         sequence embedding tensor
    #     """

    #     try:
    #         torch.cuda.empty_cache()

    #         batch_labels, batch_strs, batch_tokens = self.batch_converter(
    #             [(f"seq_{i}", seq) for i, seq in enumerate(sequences)]
    #         )
    #         batch_tokens = batch_tokens.to(device)
    #         self.model = self.model.to(device)
    #         with torch.no_grad():
    #             results = self.model(
    #                 batch_tokens, repr_layers=[self.layers], return_contacts=False
    #             )
    #             results = results["representations"][self.layers].to("cpu")

    #         return results
    #     except Exception as e:
    #         if device != "cpu":
    #             print(f"failed with {device}, trying cpu...")
    #             return self.encode_multiple_seqs(sequences, device="cpu")
    #         else:
    #             raise e