Documentation for the tokenizers module

scprint.tokenizers.protein_embedder

Classes:

Name Description
ESM2

ESM2

ESM2 is a ghost class used to call protein LLMs to encode protein sequences.

Parameters:
  • config (str, default: 'esm-extract' ) –

    The configuration for the model. Defaults to "esm-extract".

  • pretrained_model (str, default: 'esm2_t33_650M_UR50D' ) –

    The pretrained model to be used. Defaults to "esm2_t33_650M_UR50D".

Methods:

Name Description
__call__

Call the ESM2 model on the input file.

read_results

Read multiple .pt files in a folder and convert them into a DataFrame.

Source code in scprint/tokenizers/protein_embedder.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
def __init__(
    self,
    config: str = "esm-extract",
    pretrained_model: str = "esm2_t33_650M_UR50D",
):
    """
    Set up the ESM2 wrapper by recording how the protein LLM should be invoked.

    Nothing is loaded or executed here; the two values are only stored on the
    instance for later use.

    Args:
        config (str, optional): The configuration for the model. Defaults to "esm-extract".
        pretrained_model (str, optional): The pretrained model to be used. Defaults to "esm2_t33_650M_UR50D".
    """
    self.config, self.pretrained_model = config, pretrained_model

__call__

Call the ESM2 model on the input file.

Parameters:
  • input_file (str) –

    The input file to be processed.

  • output_folder (str, default: '/tmp/esm_out/' ) –

    The folder where the output will be stored. Defaults to "/tmp/esm_out/".

  • cache (bool, default: True ) –

    If True, use cached data if available. Defaults to True.

Returns:
  • DataFrame

    pd.DataFrame: The results of the model as a DataFrame.

Source code in scprint/tokenizers/protein_embedder.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def __call__(
    self, input_file: str, output_folder: str = "/tmp/esm_out/", cache: bool = True
) -> pd.DataFrame:
    """
    Call the ESM2 model on the input file.

    The extraction command is only executed when ``output_folder`` does not
    exist yet or when ``cache`` is False. Otherwise whatever is already in
    ``output_folder`` is read back as-is.

    Args:
        input_file (str): The input file to be processed.
        output_folder (str, optional): The folder where the output will be stored. Defaults to "/tmp/esm_out/".
        cache (bool, optional): If True, use cached data if available. Defaults to True.

    Returns:
        pd.DataFrame: The results of the model as a DataFrame.

    Raises:
        RuntimeError: If running the extraction command raises an exception.
    """
    # Cache hit: the folder exists and the caller allows reuse.
    # NOTE(review): an existing but empty folder also counts as a hit.
    if cache and os.path.exists(output_folder):
        return self.read_results(output_folder)

    os.makedirs(output_folder, exist_ok=True)
    print("running ESM2")
    # Command layout: <config> <pretrained_model> <input_file> <output_folder> --include mean
    cmd = (
        " ".join([self.config, self.pretrained_model, input_file, output_folder])
        + " --include mean"
    )
    try:
        run_command(cmd, shell=True)
    except Exception as e:
        # Chain the original exception so its traceback is not lost.
        raise RuntimeError(
            "An error occurred while running the esm-extract command: " + str(e)
        ) from e
    return self.read_results(output_folder)

read_results

Read multiple .pt files in a folder and convert them into a DataFrame.

Parameters:
  • output_folder (str) –

    The folder where the .pt files are stored.

Returns:
  • pd.DataFrame: The results of the model as a DataFrame.

Source code in scprint/tokenizers/protein_embedder.py
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def read_results(self, output_folder: str) -> pd.DataFrame:
    """
    Read multiple .pt files in a folder and convert them into a DataFrame.

    Each ``.pt`` file is loaded and its ``["mean_representations"][33]`` entry
    becomes one row. Files without the ``.pt`` suffix are ignored. Rows follow
    the order returned by ``os.listdir``.

    Args:
        output_folder (str): The folder where the .pt files are stored. A
            trailing path separator is optional.

    Returns:
        pd.DataFrame: The results of the model as a DataFrame, indexed by the
            part of each file name before its first ".".
    """
    files = [name for name in os.listdir(output_folder) if name.endswith(".pt")]
    # os.path.join keeps the path valid whether or not output_folder ends with
    # a separator (plain string concatenation did not).
    # NOTE(review): index 33 presumably matches the default "esm2_t33_..."
    # model; other pretrained models may need a different key. TODO confirm.
    results = [
        load(os.path.join(output_folder, name))["mean_representations"][33]
        .numpy()
        .tolist()
        for name in files
    ]
    # NOTE(review): names containing extra dots are truncated at the first one.
    return pd.DataFrame(data=results, index=[name.split(".")[0] for name in files])

scprint.tokenizers.embedder

Functions:

Name Description
protein_embeddings_generator

protein_embeddings_generator embeds a set of genes using a FASTA file and protein LLMs.

protein_embeddings_generator

protein_embeddings_generator embeds a set of genes using a FASTA file and protein LLMs.

Parameters:
  • genedf (DataFrame, default: None ) –

    A DataFrame containing gene information.

  • organism (str, default: 'homo_sapiens' ) –

    The organism to which the genes belong. Defaults to "homo_sapiens".

  • cache (bool, default: True ) –

    If True, the function will use cached data if available. Defaults to True.

  • fasta_path (str, default: '/tmp/data/fasta/' ) –

    The path to the directory where the fasta files are stored. Defaults to "/tmp/data/fasta/".

  • embedding_size (int, default: 512 ) –

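    The size of the embeddings to be generated. Defaults to 512.

  • embedder (str, default: 'esm3' ) –

    The protein language model used to embed the sequences, either "esm2" or "esm3". Defaults to "esm3".

  • cuda (bool, default: True ) –

    If True, the "esm3" model is run on "cuda"; otherwise it is run on "cpu". Defaults to True.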

Returns: a tuple of two pd.DataFrame objects: first the DataFrame containing the protein embeddings, then the naming dataframe.

Source code in scprint/tokenizers/embedder.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
def protein_embeddings_generator(
    genedf: pd.DataFrame = None,
    organism: str = "homo_sapiens",  # mus_musculus,
    cache: bool = True,
    fasta_path: str = "/tmp/data/fasta/",
    embedding_size: int = 512,
    embedder: str = "esm3",  # or esm2
    cuda: bool = True,
):
    """
    protein_embeddings_generator embeds a set of genes using a fasta file and protein LLMs

    Args:
        genedf (pd.DataFrame): A DataFrame containing gene information. When given,
            only rows whose "biotype" is "protein_coding" are kept and their index
            is used to subset the fasta. When None, no gene list is passed to the
            subsetting step.
        organism (str, optional): The organism to which the genes belong. Defaults to "homo_sapiens".
        cache (bool, optional): If True, the function will use cached data if available. Defaults to True.
        fasta_path (str, optional): The path to the directory where the fasta files are stored. Defaults to "/tmp/data/fasta/".
        embedding_size (int, optional): The size of the embeddings to be generated. Defaults to 512.
        embedder (str, optional): The protein language model to use, "esm2" or "esm3". Defaults to "esm3".
        cuda (bool, optional): If True, the "esm3" model is moved to "cuda", otherwise to "cpu".
            Not used by the "esm2" branch. Defaults to True.

    Returns:
        pd.DataFrame: Returns a DataFrame containing the protein embeddings.
        pd.DataFrame: Returns the naming dataframe.

    Raises:
        ValueError: If ``embedder`` is neither "esm2" nor "esm3".
    """
    # Reject an unsupported embedder before any download, decompression or
    # subsetting work is done.
    if embedder not in ("esm2", "esm3"):
        raise ValueError(f"Embedder {embedder} not supported")

    # given a gene file and organism
    # load the organism fasta if not already done
    fasta_path_pep, _fasta_path_ncrna = utils.load_fasta_species(
        species=organism, output_path=fasta_path, cache=cache
    )
    # subset the fasta
    fasta_name = fasta_path_pep.split("/")[-1]
    utils.utils.run_command(["gunzip", fasta_path_pep])
    protgenedf = (
        genedf[genedf["biotype"] == "protein_coding"] if genedf is not None else None
    )
    subset_path = fasta_path + "subset.fa"
    _found, naming_df = utils.subset_fasta(
        protgenedf.index.tolist() if protgenedf is not None else None,
        subfasta_path=subset_path,
        # fasta_name[:-3] drops the last three characters of the file name
        # (presumably the ".gz" suffix removed by gunzip above).
        fasta_path=fasta_path + fasta_name[:-3],
        drop_unknown_seq=True,
    )
    if embedder == "esm2":
        prot_embedder = ESM2()
        esm2_df = prot_embedder(
            subset_path, output_folder=fasta_path + "esm_out/", cache=cache
        )
        # The ESM2 wrapper returns a DataFrame; split it into row labels and
        # raw values so both branches feed the pooling step the same way.
        names = esm2_df.index.tolist()
        prot_embeddings = esm2_df.values
    else:  # embedder == "esm3"
        from Bio import SeqIO
        from esm.models.esmc import ESMC
        from esm.sdk.api import ESMProtein, LogitsConfig

        prot_embeddings = []
        names = []
        client = ESMC.from_pretrained("esmc_600m").to("cuda" if cuda else "cpu")
        conf = LogitsConfig(sequence=True, return_embeddings=True)
        with open(subset_path, "r") as fasta:
            for record in tqdm(SeqIO.parse(fasta, "fasta")):
                protein = ESMProtein(sequence=str(record.seq))
                protein_tensor = client.encode(protein)
                logits_output = client.logits(protein_tensor, conf)
                # Average over dim 0 of the first embeddings entry to get one
                # vector per fasta record.
                prot_embeddings.append(
                    logits_output.embeddings[0].mean(0).cpu().numpy().tolist()
                )
                names.append(record.id)
    # TODO: to redebug
    #  - re-gzip the peptide fasta once the embeddings are computed
    #  - do the same for RNA: subset the ".ncrna.fa.gz" fasta for the
    #    non protein_coding genes, embed it (RNABERT) and concatenate the
    #    result with the protein embeddings
    # Resize every embedding to `embedding_size` values with adaptive average pooling.
    m = AdaptiveAvgPool1d(embedding_size)
    prot_embeddings = pd.DataFrame(
        data=m(torch.tensor(np.array(prot_embeddings))), index=names
    )
    return prot_embeddings, naming_df