Documentation for the tokenizers module

scprint.tokenizers.protein_embedder

PROTBERT

PROTBERT is a ghost class (a thin wrapper) used to call protein LLMs to encode protein sequences.

Parameters:
  • config (str, default: 'esm-extract' ) –

    The configuration for the model. Defaults to "esm-extract".

  • pretrained_model (str, default: 'esm2_t33_650M_UR50D' ) –

    The pretrained model to be used. Defaults to "esm2_t33_650M_UR50D".

Source code in scprint/tokenizers/protein_embedder.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def __init__(
    self,
    config: str = "esm-extract",
    pretrained_model: str = "esm2_t33_650M_UR50D",
):
    """
    Set up a PROTBERT wrapper used to encode protein sequences with a protein LLM.

    Nothing is loaded or executed here; the two settings are only stored on
    the instance so that `__call__` can use them later.

    Args:
        config (str, optional): The configuration for the model. Defaults to "esm-extract".
        pretrained_model (str, optional): The pretrained model to be used. Defaults to "esm2_t33_650M_UR50D".
    """
    self.config, self.pretrained_model = config, pretrained_model

__call__

Call the PROTBERT model on the input file.

Parameters:
  • input_file (str) –

    The input file to be processed.

  • output_folder (str, default: '/tmp/esm_out/' ) –

    The folder where the output will be stored. Defaults to "/tmp/esm_out/".

  • cache (bool, default: True ) –

    If True, use cached data if available. Defaults to True.

Returns:
  • DataFrame

    pd.DataFrame: The results of the model as a DataFrame.

Source code in scprint/tokenizers/protein_embedder.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def __call__(
    self, input_file: str, output_folder: str = "/tmp/esm_out/", cache: bool = True
) -> pd.DataFrame:
    """
    Call the PROTBERT model on the input file.

    The model is run through an external command built as
    "<config> <pretrained_model> <input_file> <output_folder> --include mean".
    The command is skipped only when `cache` is True and `output_folder`
    already contains at least one ".pt" file; an existing but empty folder
    (e.g. left behind by a previous failed run) is not treated as a cache.

    Args:
        input_file (str): The input file to be processed.
        output_folder (str, optional): The folder where the output will be stored. Defaults to "/tmp/esm_out/".
        cache (bool, optional): If True, use cached data if available. Defaults to True.

    Returns:
        pd.DataFrame: The results of the model as a DataFrame.

    Raises:
        RuntimeError: If running the command fails; the original exception
            is attached as the cause.
    """
    # The folder is created before the command runs, so its mere existence
    # says nothing about whether a previous run succeeded. Require actual
    # ".pt" outputs before trusting the cache.
    has_cached_output = os.path.isdir(output_folder) and any(
        name.endswith(".pt") for name in os.listdir(output_folder)
    )
    if not (cache and has_cached_output):
        os.makedirs(output_folder, exist_ok=True)
        print("running protbert")
        # NOTE(review): the command is a single string passed with shell=True,
        # and the paths are not quoted — paths containing spaces or shell
        # metacharacters are presumably not supported; confirm before
        # accepting user-provided paths.
        cmd = " ".join(
            [
                self.config,
                self.pretrained_model,
                input_file,
                output_folder,
                "--include mean",
            ]
        )
        try:
            run_command(cmd, shell=True)
        except Exception as e:
            # Chain the original error so the traceback shows the root cause.
            raise RuntimeError(
                "An error occurred while running the esm-extract command: " + str(e)
            ) from e
    return self.read_results(output_folder)

read_results

Read multiple .pt files in a folder and convert them into a DataFrame.

Parameters:
  • output_folder (str) –

    The folder where the .pt files are stored.

Returns:
  • pd.DataFrame: The results of the model as a DataFrame.

Source code in scprint/tokenizers/protein_embedder.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def read_results(self, output_folder):
    """
    Read multiple .pt files in a folder and convert them into a DataFrame.

    Each ".pt" file is loaded and its "mean_representations" entry is used
    as one row. Layer 33 is used when present (the final layer of the default
    "esm2_t33_650M_UR50D" model); otherwise the highest stored layer is used,
    so that other pretrained models do not fail with a KeyError.

    Args:
        output_folder (str): The folder where the .pt files are stored. A
            trailing path separator is not required.

    Returns:
        pd.DataFrame: The results of the model as a DataFrame, one row per
            file, indexed by the file name up to its first ".". Rows are
            ordered by file name.
    """
    # os.listdir gives no ordering guarantee; sort for reproducible output.
    names = sorted(name for name in os.listdir(output_folder) if name.endswith(".pt"))
    rows = []
    for name in names:
        # os.path.join works whether or not output_folder ends with a separator.
        layers = load(os.path.join(output_folder, name))["mean_representations"]
        # presumably keys are layer indices and the highest one is the final
        # layer — TODO confirm against the extraction tool's output format.
        layer = 33 if 33 in layers else max(layers)
        rows.append(layers[layer].numpy().tolist())
    return pd.DataFrame(data=rows, index=[name.split(".")[0] for name in names])

scprint.tokenizers.embedder

protein_embeddings_generator

protein_embeddings_generator embeds a set of genes using a FASTA file and protein LLMs.

Parameters:
  • genedf (DataFrame) –

    A DataFrame containing gene information.

  • organism (str, default: 'homo_sapiens' ) –

    The organism to which the genes belong. Defaults to "homo_sapiens".

  • cache (bool, default: True ) –

    If True, the function will use cached data if available. Defaults to True.

  • fasta_path (str, default: '/tmp/data/fasta/' ) –

    The path to the directory where the fasta files are stored. Defaults to "/tmp/data/fasta/".

  • embedding_size (int, default: 512 ) –

    The size of the embeddings to be generated. Defaults to 512.

Returns:
  • pd.DataFrame: Returns a DataFrame containing the protein embeddings (the RNA embedding step is currently commented out in the source, so no RNA embeddings are returned).

Source code in scprint/tokenizers/embedder.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def protein_embeddings_generator(
    genedf: pd.DataFrame,
    organism: str = "homo_sapiens",  # mus_musculus,
    cache: bool = True,
    fasta_path: str = "/tmp/data/fasta/",
    embedding_size: int = 512,
):
    """
    protein_embeddings_generator embeds a set of genes using a fasta file and LLMs

    Steps: fetch the organism's fasta files, decompress the ".all.fa.gz"
    file, write a subset fasta restricted to the protein-coding genes of
    `genedf`, embed it with PROTBERT, re-compress the fasta, and finally
    pool every embedding to `embedding_size` values with AdaptiveAvgPool1d.

    Args:
        genedf (pd.DataFrame): A DataFrame containing gene information. It must have a
            "biotype" column; the index of the "protein_coding" rows is passed to the
            fasta subsetting step.
        organism (str, optional): The organism to which the genes belong. Defaults to "homo_sapiens".
        cache (bool, optional): If True, the function will use cached data if available. Defaults to True.
        fasta_path (str, optional): The path to the directory where the fasta files are stored. Defaults to "/tmp/data/fasta/".
        embedding_size (int, optional): The size of the embeddings to be generated. Defaults to 512.

    Returns:
        pd.DataFrame: Returns a DataFrame containing the protein embeddings.
            (The RNA embedding step is currently disabled, see the TODO below.)

    Raises:
        FileNotFoundError: If no file ending in ".all.fa.gz" is found in `fasta_path`.
    """
    # given a gene file and organism
    # load the organism fasta if not already done
    utils.load_fasta_species(species=organism, output_path=fasta_path, cache=cache)
    # subset the fasta
    fasta_file = next(
        (name for name in os.listdir(fasta_path) if name.endswith(".all.fa.gz")),
        None,
    )
    if fasta_file is None:
        # A bare next() would leak an uninformative StopIteration here.
        raise FileNotFoundError(
            "No '.all.fa.gz' fasta file found in " + str(fasta_path)
        )
    # os.path.join works whether or not fasta_path ends with a separator.
    gz_fasta = os.path.join(fasta_path, fasta_file)
    raw_fasta = gz_fasta[:-3]  # same path without the ".gz" suffix
    subset_fasta = os.path.join(fasta_path, "subset.fa")
    # Trailing "" keeps a trailing separator on the output folder.
    esm_out = os.path.join(fasta_path, "esm_out", "")

    protgenedf = genedf[genedf["biotype"] == "protein_coding"]
    utils.utils.run_command(["gunzip", gz_fasta])
    try:
        utils.subset_fasta(
            protgenedf.index.tolist(),
            subfasta_path=subset_fasta,
            fasta_path=raw_fasta,
            drop_unknown_seq=True,
        )
        # subset the gene file
        # embed
        prot_embedder = PROTBERT()
        prot_embeddings = prot_embedder(
            subset_fasta, output_folder=esm_out, cache=cache
        )
    finally:
        # load the data and erase / zip the rest
        # Always re-compress, even on failure: otherwise the ".all.fa.gz"
        # file is missing on the next call and the lookup above fails.
        utils.utils.run_command(["gzip", raw_fasta])
    # return the embedding and gene file
    # TODO: to redebug
    # do the same for RNA
    # rnagenedf = genedf[genedf["biotype"] != "protein_coding"]
    # fasta_file = next(
    #    file for file in os.listdir(fasta_path) if file.endswith(".ncrna.fa.gz")
    # )
    # utils.utils.run_command(["gunzip", fasta_path + fasta_file])
    # utils.subset_fasta(
    #    rnagenedf["ensembl_gene_id"].tolist(),
    #    subfasta_path=fasta_path + "subset.ncrna.fa",
    #    fasta_path=fasta_path + fasta_file[:-3],
    #    drop_unknown_seq=True,
    # )
    # rna_embedder = RNABERT()
    # rna_embeddings = rna_embedder(fasta_path + "subset.ncrna.fa")
    ## Check if the sizes of the embeddings are not the same
    # utils.utils.run_command(["gzip", fasta_path + fasta_file[:-3]])
    #
    m = AdaptiveAvgPool1d(embedding_size)
    pooled = m(torch.tensor(prot_embeddings.values))
    # Convert explicitly to a NumPy array instead of relying on how pandas
    # coerces a torch tensor.
    prot_embeddings = pd.DataFrame(data=pooled.numpy(), index=prot_embeddings.index)
    # rna_embeddings = pd.DataFrame(
    #    data=m(torch.tensor(rna_embeddings.values)), index=rna_embeddings.index
    # )
    # Concatenate the embeddings
    return prot_embeddings  # pd.concat([prot_embeddings, rna_embeddings])