# Source code for pairk.backend.tools.esm_tools

import torch


class ESM_Model:
    """Load a pretrained ESM model and encode protein sequences with it.

    This was adapted from the kibby conservation method
    (DOI: 10.1093/bib/bbac599), see https://github.com/esbgkannan/kibby

    Available models:

    - esm1b_t33_650M_UR50S
    - esm2_t6_8M_UR50D
    - esm2_t12_35M_UR50D
    - esm2_t30_150M_UR50D
    - esm2_t33_650M_UR50D (default)
    - esm2_t36_3B_UR50D

    Parameters
    ----------
    model_name : str, optional
        name of the pretrained model to load, by default
        "esm2_t33_650M_UR50D".
    threads : int, optional
        the number of threads for pytorch to use, by default 1. The value is
        passed to ``torch.set_num_threads``.

    Attributes
    ----------
    model_name : str
        the name of the model that was loaded.
    threads : int
        the number of threads for pytorch to use, by default 1.
    model
        the loaded model, switched to evaluation mode.
    batch_converter
        callable obtained from the model alphabet; turns a list of
        ``(label, sequence)`` pairs into ``(labels, strings, tokens)``.
    embed_dim : int
        ``embed_dim`` of the model's first layer.
    layers : int
        number of entries in the model's ``layers`` container; also the layer
        index whose representation ``encode`` returns.
    """

    def __init__(self, model_name: str = "esm2_t33_650M_UR50D", threads: int = 1):
        # Keep the documented ``threads`` attribute available on the instance.
        self.threads = threads
        torch.set_num_threads(threads)
        self._load(model_name)

    def _load(self, model_name):
        """Load ``model_name`` and cache the objects needed for encoding.

        Parameters
        ----------
        model_name : str
            name handed to ``esm.pretrained.load_model_and_alphabet``.
        """
        # Imported lazily so the module can be imported without ``esm``.
        import esm

        self.model_name = model_name
        self.model, alphabet = esm.pretrained.load_model_and_alphabet(model_name)
        self.batch_converter = alphabet.get_batch_converter()
        self.model.eval()
        layer_stack = self.model._modules["layers"]
        self.embed_dim = layer_stack[0].embed_dim
        self.layers = len(layer_stack)

    def encode(self, sequence: str, device: str = "cuda") -> "torch.Tensor":
        """Encode a protein sequence using the loaded model.

        If encoding on ``device`` fails for any reason and ``device`` is not
        "cpu", a message is printed and the encoding is retried on the CPU.

        Parameters
        ----------
        sequence : str
            the amino acid sequence to encode.
        device : str, optional
            whether to use a GPU via "cuda", or "cpu", by default "cuda"

        Returns
        -------
        torch.Tensor
            sequence embedding tensor: the representation at layer index
            ``self.layers`` for the single submitted sequence, moved to the
            CPU.

        Raises
        ------
        Exception
            whatever the model or batch converter raised, when the attempt on
            "cpu" fails.
        """
        try:
            return self._encode_on(sequence, device)
        except Exception:
            if device == "cpu":
                # Nothing left to fall back to; keep the original traceback.
                raise
            print(f"failed with {device}, trying cpu...")
        # Retry only after leaving the ``except`` block: while the handler is
        # active the exception's traceback keeps the failed attempt's frames
        # (and any device tensors they reference) alive.
        return self.encode(sequence, device="cpu")

    def _encode_on(self, sequence, device):
        """Run a single forward pass on ``device``; return the result on CPU."""
        torch.cuda.empty_cache()
        _labels, _strs, batch_tokens = self.batch_converter([("", sequence)])
        batch_tokens = batch_tokens.to(device)
        # The model is kept on whichever device was used last.
        self.model = self.model.to(device)
        with torch.no_grad():
            output = self.model(
                batch_tokens, repr_layers=[self.layers], return_contacts=False
            )
        # The batch holds exactly one sequence, so take its first entry.
        return output["representations"][self.layers].to("cpu")[0]
# def encode_multiple_seqs(self, sequences: list[str], device="cuda"): # """encode a protein sequence using the loaded model. # Parameters # ---------- # sequences : list[str] # list of the amino acid sequences to encode. # device : str, optional # whether to use a GPU via "cuda", or "cpu", by default "cuda" # Returns # ------- # torch.Tensor # sequence embedding tensor # """ # try: # torch.cuda.empty_cache() # batch_labels, batch_strs, batch_tokens = self.batch_converter( # [(f"seq_{i}", seq) for i, seq in enumerate(sequences)] # ) # batch_tokens = batch_tokens.to(device) # self.model = self.model.to(device) # with torch.no_grad(): # results = self.model( # batch_tokens, repr_layers=[self.layers], return_contacts=False # ) # results = results["representations"][self.layers].to("cpu") # return results # except Exception as e: # if device != "cpu": # print(f"failed with {device}, trying cpu...") # return self.encode_multiple_seqs(sequences, device="cpu") # else: # raise e