Documentation for the `tokenizers` module

`scprint.tokenizers.protein_embedder`

`PROTBERT`

PROTBERT is a ghost class that calls protein LLMs to encode protein sequences.

Parameters:	`config` (`str`, default: `'esm-extract'` ) – The configuration for the model. Defaults to "esm-extract". `pretrained_model` (`str`, default: `'esm2_t33_650M_UR50D'` ) – The pretrained model to be used. Defaults to "esm2_t33_650M_UR50D".

Source code in scprint/tokenizers/protein_embedder.py

def __init__(
    self,
    config: str = "esm-extract",
    pretrained_model: str = "esm2_t33_650M_UR50D",
):
    """
    Create a PROTBERT wrapper around an external protein language model.

    Nothing is loaded or executed here; both values are only stored on the
    instance for later use.

    Args:
        config (str, optional): The configuration for the model.
            Defaults to "esm-extract".
        pretrained_model (str, optional): The pretrained model to be used.
            Defaults to "esm2_t33_650M_UR50D".
    """
    self.config, self.pretrained_model = config, pretrained_model

`__call__`

Call the PROTBERT model on the input file.

Parameters:	`input_file` (`str`) – The input file to be processed. `output_folder` (`str`, default: `'/tmp/esm_out/'` ) – The folder where the output will be stored. Defaults to "/tmp/esm_out/". `cache` (`bool`, default: `True` ) – If True, use cached data if available. Defaults to True.

Returns:	`DataFrame` – pd.DataFrame: The results of the model as a DataFrame.

Source code in scprint/tokenizers/protein_embedder.py

def __call__(
    self, input_file: str, output_folder: str = "/tmp/esm_out/", cache: bool = True
) -> pd.DataFrame:
    """
    Call the PROTBERT model on the input file.

    The model is run as an external command assembled from ``self.config``,
    ``self.pretrained_model``, the input file and the output folder, followed
    by ``--include mean``. The run is skipped only when ``cache`` is True and
    ``output_folder`` already contains at least one ``.pt`` file.

    Args:
        input_file (str): The input file to be processed.
        output_folder (str, optional): The folder where the output will be stored. Defaults to "/tmp/esm_out/".
        cache (bool, optional): If True, use cached data if available. Defaults to True.

    Returns:
        pd.DataFrame: The results of the model as a DataFrame, as produced by
            ``self.read_results(output_folder)``.

    Raises:
        RuntimeError: If running the external command raises; the original
            exception is attached as the cause.
    """
    import shlex  # local import: only needed to build the shell command

    # A folder that exists but holds no .pt file (e.g. left behind by a failed
    # run) is not a usable cache, so the model is run again in that case.
    has_cached_results = (
        cache
        and os.path.isdir(output_folder)
        and any(name.endswith(".pt") for name in os.listdir(output_folder))
    )
    if not has_cached_results:
        os.makedirs(output_folder, exist_ok=True)
        print("running protbert")
        # The command is handed over as a single string with shell=True, so the
        # paths are quoted to survive spaces and shell metacharacters. Quoting
        # leaves ordinary paths untouched.
        cmd = " ".join(
            [
                self.config,
                self.pretrained_model,
                shlex.quote(input_file),
                shlex.quote(output_folder),
                "--include mean",
            ]
        )
        try:
            run_command(cmd, shell=True)
        except Exception as e:
            raise RuntimeError(
                "An error occurred while running the esm-extract command: " + str(e)
            ) from e
    return self.read_results(output_folder)

`read_results`

Read multiple .pt files in a folder and convert them into a DataFrame.

Parameters:	`output_folder` (`str`) – The folder where the .pt files are stored.

Returns:	– pd.DataFrame: The results of the model as a DataFrame.

Source code in scprint/tokenizers/protein_embedder.py

def read_results(self, output_folder):
    """
    Read multiple .pt files in a folder and convert them into a DataFrame.

    Every file in ``output_folder`` whose name ends with ``.pt`` is loaded and
    its ``["mean_representations"][33]`` entry becomes one row. Rows are
    labelled with the part of the file name before the first ".".

    Args:
        output_folder (str): The folder where the .pt files are stored. A
            trailing path separator is not required.

    Returns:
        pd.DataFrame: The results of the model as a DataFrame, one row per
            .pt file.
    """
    pt_files = [name for name in os.listdir(output_folder) if name.endswith(".pt")]
    # NOTE(review): layer 33 is hard-coded; presumably it matches the default
    # "esm2_t33_650M_UR50D" model -- confirm before using another model.
    # os.path.join is used so the folder works with or without a trailing "/".
    results = [
        load(os.path.join(output_folder, name))["mean_representations"][33]
        .numpy()
        .tolist()
        for name in pt_files
    ]
    return pd.DataFrame(
        data=results, index=[name.split(".")[0] for name in pt_files]
    )

`scprint.tokenizers.embedder`

`protein_embeddings_generator`

protein_embeddings_generator embeds a set of genes using a FASTA file and protein LLMs.

Parameters:

genedf (DataFrame) –

A DataFrame containing gene information.
organism (str, default: 'homo_sapiens' ) –

The organism to which the genes belong. Defaults to "homo_sapiens".
cache (bool, default: True ) –

If True, the function will use cached data if available. Defaults to True.
fasta_path (str, default: '/tmp/data/fasta/' ) –

The path to the directory where the fasta files are stored. Defaults to "/tmp/data/fasta/".
embedding_size (int, default: 512 ) –

The size of the embeddings to be generated. Defaults to 512.

Returns:	– pd.DataFrame: Returns a DataFrame containing the protein embeddings (the RNA embedding step is currently disabled in the source, so no RNA embeddings are included).

Source code in scprint/tokenizers/embedder.py

def protein_embeddings_generator(
    genedf: pd.DataFrame,
    organism: str = "homo_sapiens",  # mus_musculus,
    cache: bool = True,
    fasta_path: str = "/tmp/data/fasta/",
    embedding_size: int = 512,
):
    """
    protein_embeddings_generator embeds a set of genes using a fasta file and LLMs

    Steps: fetch the organism's fasta files, decompress the ``.all.fa.gz`` one,
    write a ``subset.fa`` restricted to the protein-coding genes of ``genedf``,
    embed that subset with PROTBERT, re-compress the fasta file, and pool each
    embedding down to ``embedding_size`` values.

    Args:
        genedf (pd.DataFrame): A DataFrame containing gene information. It must
            have a "biotype" column; its index is used as the list of genes to
            keep in the fasta subset.
        organism (str, optional): The organism to which the genes belong. Defaults to "homo_sapiens".
        cache (bool, optional): If True, the function will use cached data if available. Defaults to True.
        fasta_path (str, optional): The path to the directory where the fasta files are stored. Defaults to "/tmp/data/fasta/".
        embedding_size (int, optional): The size of the embeddings to be generated. Defaults to 512.

    Returns:
        pd.DataFrame: A DataFrame containing the protein embeddings, one row
            per embedded sequence and ``embedding_size`` columns. RNA embeddings
            are not produced yet (see the TODO below).

    Raises:
        FileNotFoundError: If no file ending in ".all.fa.gz" is found in
            ``fasta_path`` after the download step.
    """
    # given a gene file and organism
    # load the organism fasta if not already done
    utils.load_fasta_species(species=organism, output_path=fasta_path, cache=cache)
    # locate the compressed protein fasta; a default is passed to next() so a
    # missing file gives a clear error instead of a leaked StopIteration
    fasta_file = next(
        (file for file in os.listdir(fasta_path) if file.endswith(".all.fa.gz")),
        None,
    )
    if fasta_file is None:
        raise FileNotFoundError(
            "no '.all.fa.gz' fasta file found in " + str(fasta_path)
        )
    gz_fasta = os.path.join(fasta_path, fasta_file)
    raw_fasta = gz_fasta[:-3]  # same path without the ".gz" suffix
    subset_fasta = os.path.join(fasta_path, "subset.fa")

    protgenedf = genedf[genedf["biotype"] == "protein_coding"]
    utils.utils.run_command(["gunzip", gz_fasta])
    try:
        # subset the fasta
        utils.subset_fasta(
            protgenedf.index.tolist(),
            subfasta_path=subset_fasta,
            fasta_path=raw_fasta,
            drop_unknown_seq=True,
        )
        # embed
        prot_embedder = PROTBERT()
        prot_embeddings = prot_embedder(
            subset_fasta,
            output_folder=os.path.join(fasta_path, "esm_out/"),
            cache=cache,
        )
    finally:
        # always re-compress, even when subsetting/embedding fails; otherwise
        # the next call would not find the ".all.fa.gz" file any more
        utils.utils.run_command(["gzip", raw_fasta])
    # TODO: to redebug
    # do the same for RNA
    # rnagenedf = genedf[genedf["biotype"] != "protein_coding"]
    # fasta_file = next(
    #    file for file in os.listdir(fasta_path) if file.endswith(".ncrna.fa.gz")
    # )
    # utils.utils.run_command(["gunzip", fasta_path + fasta_file])
    # utils.subset_fasta(
    #    rnagenedf["ensembl_gene_id"].tolist(),
    #    subfasta_path=fasta_path + "subset.ncrna.fa",
    #    fasta_path=fasta_path + fasta_file[:-3],
    #    drop_unknown_seq=True,
    # )
    # rna_embedder = RNABERT()
    # rna_embeddings = rna_embedder(fasta_path + "subset.ncrna.fa")
    ## Check if the sizes of the embeddings are not the same
    # utils.utils.run_command(["gzip", fasta_path + fasta_file[:-3]])
    #
    # average-pool every embedding along its feature axis to embedding_size
    m = AdaptiveAvgPool1d(embedding_size)
    pooled = m(torch.tensor(prot_embeddings.values))
    # hand pandas a NumPy array rather than a torch tensor so the cells are
    # plain floats
    prot_embeddings = pd.DataFrame(data=pooled.numpy(), index=prot_embeddings.index)
    # rna_embeddings = pd.DataFrame(
    #    data=m(torch.tensor(rna_embeddings.values)), index=rna_embeddings.index
    # )
    # Concatenate the embeddings
    return prot_embeddings  # pd.concat([prot_embeddings, rna_embeddings])

Documentation for the tokenizers module

scprint.tokenizers.protein_embedder

PROTBERT

__call__

read_results

scprint.tokenizers.embedder

protein_embeddings_generator

Documentation for the `tokenizers` module

`scprint.tokenizers.protein_embedder`

`PROTBERT`

`__call__`

`read_results`

`scprint.tokenizers.embedder`

`protein_embeddings_generator`