Source code for sc_toolbox.tools

from __future__ import annotations

import os
from typing import List

from pandas import Categorical
from statsmodels.stats.multitest import fdrcorrection

try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal  # type: ignore

from typing import Optional

import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData
from rich import print

WORKING_DIRECTORY = os.path.dirname(__file__)


def generate_expression_table(
    adata,
    cluster: str = "all",
    subset_by: str = "cell_type",
    xlabel: str = None,
    condition: str = None,
    use_raw: bool = None,
):
    """Generates a table of cells by genes of expression values as a Pandas DataFrame.

    Args:
        adata: AnnData object
        cluster: Which label of the subsets to generate the table for. Use 'all' for all subsets.
        subset_by: Which label to subset the clusters by
        xlabel: Label that will be used for subsequent line plots as x-axis label. Typically a time series such as "days".
        condition: Column name of the condition to include.
        use_raw: Whether to use adata.raw.X for the calculations

    Returns:
        Gene expression table.
    """
    if cluster == "all":
        cells = adata.obs_names
    else:
        cells = [True if val in cluster else False for val in adata.obs[subset_by]]

    if use_raw:
        gen_expression_table = pd.DataFrame(
            adata[cells].raw.X.todense(), index=adata[cells].obs_names, columns=adata[cells].raw.var_names
        )
    else:
        gen_expression_table = pd.DataFrame(
            adata[cells].X, index=adata[cells].obs_names, columns=adata[cells].var_names
        )

    gen_expression_table["identifier"] = adata[cells].obs["identifier"]
    if xlabel:
        gen_expression_table[xlabel] = adata[cells].obs[xlabel]
    if condition:
        # For multiple clusters, split internally per condition
        if isinstance(cluster, list) and len(cluster) > 1 and subset_by != condition:
            gen_expression_table[condition] = [
                f"{t}_{c}" for t, c in zip(adata[cells].obs[condition], adata[cells].obs[subset_by])
            ]
        else:
            gen_expression_table[condition] = adata[cells].obs[condition]

    return gen_expression_table

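# Usage sketch (illustrative, not part of the module): assumes `adata` carries the obs
# columns "identifier", "cell_type" and "days"; adjust names to your data.
#
#     expr = generate_expression_table(
#         adata, cluster=["AT2 cells"], subset_by="cell_type", xlabel="days", use_raw=True
#     )
#     expr.groupby("days").mean()  # mean expression per time point
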
def relative_frequencies(adata, group_by: str = "cell_type", xlabel: str = "days", condition: str = "batch"):
    """Calculates the relative frequencies of conditions grouped by an observation.

    Args:
        adata: AnnData object containing the data
        group_by: Column name to group by
        xlabel: x-axis label
        condition: Condition column to additionally annotate each sample with

    Returns:
        Relative frequencies in a Pandas DataFrame
    """
    freqs = adata.obs.groupby(["identifier", group_by]).size()
    samples = np.unique(adata.obs["identifier"])
    ind = adata.obs[group_by].cat.categories

    relative_frequencies = [freqs[ident] / sum(freqs[ident]) for ident in samples]
    relative_frequencies = pd.DataFrame(relative_frequencies, columns=ind, index=samples).fillna(0)

    # relFreqs[xlabel] = grouping.loc[samples, xlabel]  # when using a grouping table
    cell_types = {}
    combis = adata.obs.groupby(["identifier", xlabel]).groups.keys()

    for c in combis:
        cell_types[c[0]] = c[1]
    relative_frequencies[xlabel] = [cell_types[label] for label in relative_frequencies.index]  # type: ignore

    # Todo, add for condition
    if condition:
        combis = adata.obs.groupby(["identifier", condition]).groups.keys()
        for c in combis:
            cell_types[c[0]] = c[1]
        relative_frequencies[condition] = [cell_types[label] for label in relative_frequencies.index]  # type: ignore

    return relative_frequencies

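# Usage sketch (illustrative, not part of the module): assumes one sample per value of the
# obs column "identifier" and categorical "cell_type", "days" and "batch" obs columns.
#
#     rel = relative_frequencies(adata, group_by="cell_type", xlabel="days", condition="batch")
#     rel.head()  # one row per sample, one column per cell type plus "days" and "batch"
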
def relative_frequency_per_cluster(adata, group_by: str = "cell_type", xlabel: str = "days", condition=None):
    """Calculates relative frequencies per cluster.

    Args:
        adata: AnnData object containing the data
        group_by: The label to group by for the clusters
        xlabel: x-axis label
        condition: Condition to combine by

    Returns:
        Pandas DataFrame of relative frequencies
    """
    frequencies = adata.obs.groupby([group_by, xlabel]).size()
    celltypes = np.unique(adata.obs[group_by])
    ind = adata.obs[xlabel].cat.categories

    relative_frequencies = [frequencies[ident] / sum(frequencies[ident]) for ident in celltypes]
    relative_frequencies = pd.DataFrame(relative_frequencies, columns=ind, index=celltypes).fillna(0)

    cell_types = {}
    combinations = adata.obs.groupby([group_by, xlabel]).groups.keys()

    for combination in combinations:
        cell_types[combination[0]] = combination[1]

    relative_frequencies[group_by] = relative_frequencies.index  # type: ignore

    # Todo, add for condition
    if condition:
        combinations = adata.obs.groupby([group_by, condition]).groups.keys()
        for combination in combinations:
            cell_types[combination[0]] = combination[1]
        relative_frequencies[condition] = [cell_types[label] for label in relative_frequencies.index]  # type: ignore

    return relative_frequencies

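# Usage sketch (illustrative, not part of the module): assumes a categorical "days" obs column.
#
#     rel = relative_frequency_per_cluster(adata, group_by="cell_type", xlabel="days")
#     rel.head()  # one row per cell type, one column per time point
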
def correlate_to_signature(
    adata,
    marker: pd.DataFrame,
    log_fc_threshold: float = 0.7,
    cell_type: str = "AT2 cells",
    cell_type_label: str = "cell_type",
    log_fc_label: str = "logfoldchange",
    gene_label: str = "gene",
    use_raw: bool = True,
):
    """Correlation score based on a cell type signature (logFC) - an alternative to sc.tl.score_genes.

    Args:
        adata: AnnData object containing the data
        marker: Pandas DataFrame containing marker genes
        log_fc_threshold: Log fold change threshold
        cell_type: Cell type to calculate the correlation for
        cell_type_label: Label of all cell types in the AnnData object
        log_fc_label: Label of the fold change in the AnnData object
        gene_label: Label of genes in the AnnData object
        use_raw: Whether to use adata.raw.X

    Returns:
        List of correlations
    """
    from scipy.sparse import issparse

    topmarker = marker[marker.loc[:, cell_type_label] == cell_type]
    topmarker = topmarker.loc[topmarker.loc[:, log_fc_label] > log_fc_threshold, [gene_label, log_fc_label]]

    gene_names = list(np.intersect1d(adata.var_names, topmarker.loc[:, gene_label].astype(str)))
    topmarker = topmarker[topmarker.loc[:, gene_label].isin(gene_names)]
    print(f"[bold blue]{len(gene_names)} genes used for correlation score to {cell_type}")

    if use_raw:
        if issparse(adata.raw.X):
            gene_expression = adata.raw[:, gene_names].X.todense()
        else:
            gene_expression = adata.raw[:, gene_names].X
    else:
        if issparse(adata.X):
            gene_expression = adata[:, gene_names].X.todense()
        else:
            gene_expression = adata[:, gene_names].X

    gene_expression = pd.DataFrame(gene_expression.T, index=gene_names)

    # Correlate each cell separately against the signature log fold changes
    gene_expression = pd.DataFrame.fillna(gene_expression, value=0)
    res = [
        np.correlate(topmarker.loc[:, log_fc_label], gene_expression.iloc[:, c])[0]
        for c in range(gene_expression.shape[1])
    ]

    return res

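# Usage sketch (illustrative, not part of the module): the file name and column layout of the
# marker table ("cell_type", "logfoldchange", "gene") are assumptions for the example only.
#
#     marker = pd.read_csv("at2_markers.tsv", sep="\t")
#     adata.obs["AT2_signature_corr"] = correlate_to_signature(
#         adata, marker, cell_type="AT2 cells", log_fc_threshold=0.7
#     )
#     sc.pl.umap(adata, color="AT2_signature_corr")
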
def remove_outliers(cords, eps: int = 1, min_samples: int = 2) -> Categorical:
    """Remove outlying cells based on UMAP embeddings with DBSCAN (density based clustering).

    Call as: sub.obs["d_cluster"] = remove_outliers(sub.obsm["X_umap"], min_samples=10)

    Args:
        cords: adata UMAP coordinates, typically adata.obsm["X_umap"]
        eps: Maximum distance between two samples to still be considered neighbors
        min_samples: Minimum samples of a cluster

    Returns:
        Pandas Categorical of clusters
    """
    from natsort import natsorted
    from sklearn.cluster import DBSCAN

    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(cords)
    cluster = clustering.labels_.astype("U")

    return pd.Categorical(cluster, categories=natsorted(np.unique(cluster)))

def add_percentages(adata, table, ids, group_by: str, threshold: int = 0, gene_label: str = "gene"):
    """Add columns to an existing diffxpy table specifying the percentage of expressing cells.

    Args:
        adata: AnnData object containing the data
        table: Table as generated by diffxpy
        ids: Identifiers to add percentages for
        group_by: Label to group by
        threshold: Cell count threshold
        gene_label: Label of the genes

    Returns:
        Table containing the percentage of expressing cells
    """
    for ident in ids:
        cells = adata.obs_names[adata.obs[group_by] == ident]
        data_temp = pd.DataFrame(
            ((adata[cells].layers["counts"] > threshold).sum(0) / adata[cells].layers["counts"].shape[0]).T,
            index=adata.var_names,
        )

        if gene_label == "index":
            table[f"pct.{ident}s"] = data_temp.reindex(table.index.values).values
        else:
            table[f"pct.{ident}s"] = data_temp.reindex(table.loc[:, gene_label]).values

    return table

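# Usage sketch (illustrative, not part of the module): assumes raw counts in
# adata.layers["counts"], a "condition" obs column, and a DE table with a "gene" column.
#
#     de_table = add_percentages(
#         adata, de_table, ids=["treated", "control"], group_by="condition", gene_label="gene"
#     )
#     # adds the columns "pct.treateds" and "pct.controls"
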
def ranksums_between_groups(
    table, id1: str = "bystander", id2: str = "infected", xlabel: str = "condition", cells=None, score: str = "Axin2"
):
    """Perform a Wilcoxon rank-sum test between two groups.

    Args:
        table: Pandas DataFrame containing the expression table (e.g. generated by generate_expression_table)
        id1: Identifier of the first group in the `xlabel` column
        id2: Identifier of the second group in the `xlabel` column
        xlabel: Column containing the group labels
        cells: Optional subset of cells (index values) to restrict the test to
        score: Column (e.g. gene or score) to test

    Returns:
        Pandas DataFrame containing the test statistic and p-value
    """
    from scipy import stats

    if cells is not None:
        table = table.loc[cells].copy()

    group1 = table[table.loc[:, xlabel] == id1].copy()
    group2 = table[table.loc[:, xlabel] == id2].copy()

    t, p = stats.ranksums(group1.loc[:, score], group2.loc[:, score])
    result = pd.DataFrame(columns=["wilcoxon_ranksum", "pval"])
    result.loc[0] = [t, p]

    return result

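# Usage sketch (illustrative, not part of the module): assumes a "condition" obs column with
# the values "bystander" and "infected" and that the gene "Axin2" is present in the table.
#
#     expr = generate_expression_table(adata, cluster=["AT2 cells"], xlabel="days", condition="condition")
#     ranksums_between_groups(expr, id1="bystander", id2="infected", xlabel="condition", score="Axin2")
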
def generate_count_object(
    adata,
    hue: str = "disease",
    cell_type_label: str = "cell_type",
    cell_type: List[str] = None,
    min_samples: int = 2,
    min_cells: int = 5,
    ref: str = "healthy",
    subset: List[str] = None,
    layer: str = "counts",
    outliers_removal: bool = False,
):
    """Generates a raw count AnnData object for a subset of conditions and cell types, e.g. as input for diffxpy.

    Args:
        adata: AnnData object
        hue: Value to color by
        cell_type_label: Label containing cell types
        cell_type: Cell types to generate counts for
        min_samples: Minimum samples for outlier removal with DBSCAN
        min_cells: Minimal number of cells for gene filtering
        ref: Reference category of `hue`; will be set as the first category
        subset: Values of adata.obs["grouping"] to subset the object to
        layer: Layer containing the raw counts
        outliers_removal: Whether to remove outliers or not

    Returns:
        AnnData object containing counts

    Example Call:
        subset = ['3d PI-KO', '3d PI-WT']

        raw_counts = generate_count_object(
            adata,
            hue="grouping",
            cell_type_label="celltype_refined",
            cell_type=["AT2"],
            ref="3d PI-WT",
            subset=subset,
        )
    """
    adata_subset = adata[adata.obs.grouping.isin(subset)]
    cells = [
        True if (adata_subset.obs[cell_type_label][i] in cell_type) else False  # type: ignore
        for i in range(adata_subset.n_obs)
    ]

    # Raw count data for diffxpy
    obs = adata_subset[cells].obs.copy()
    var = adata_subset.var_names.copy()
    adata_raw = sc.AnnData(X=adata_subset[cells].layers[layer].copy())
    adata_raw.obs = obs
    adata_raw.var.index = var
    adata_raw.obsm = adata_subset[cells].obsm.copy()

    # Optionally tidy up with DBSCAN
    if outliers_removal:
        adata_raw.obs["dcluster"] = remove_outliers(adata_raw.obsm["X_umap"], min_samples=min_samples)
        sc.pl.umap(adata_raw, color=[hue, "dcluster"])
        adata_raw = adata_raw[adata_raw.obs.dcluster == "0"].copy()

    sc.pp.filter_genes(adata_raw, min_cells=min_cells)

    # Set the reference as the first category
    adata_raw.obs.loc[:, hue].cat.reorder_categories([ref, np.setdiff1d(subset, ref)[0]], inplace=True)

    pal = adata_subset.uns[f"{hue}_colors"]
    sc.pl.umap(adata_raw, color=[hue], palette=list(pal))

    return adata_raw

def tidy_de_table(de_test, adata, cells, ids=None, qval_thresh: float = 0.9, group_by: str = "treatment", cols=None):
    """Sorts a diffxpy de table and adds percentages of expression per group.

    Args:
        de_test: diffxpy de test
        adata: AnnData object
        cells: Cells (obs_names) to subset the AnnData object to
        ids: Identifiers to add percentages for
        qval_thresh: q-value threshold to filter the results by
        group_by: Label to group by
        cols: Columns of the diffxpy summary to keep

    Returns:
        Pandas DataFrame of the diffxpy table with percentages
    """
    result = de_test.summary().sort_values(by=["qval"], ascending=True)
    result = result[result.qval < qval_thresh].loc[:, cols].copy()

    # Add percentages of expressing cells per group
    result = add_percentages(adata[cells], result, ids=ids, group_by=group_by)

    return result

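# Usage sketch (illustrative, not part of the module): `de_test` would be a diffxpy test object
# (e.g. from de.test.wald); the column names and identifiers below are assumptions for the example.
#
#     res = tidy_de_table(
#         de_test,
#         adata,
#         cells=adata.obs_names,
#         ids=["treated", "control"],
#         group_by="treatment",
#         cols=["gene", "pval", "qval", "log2fc"],
#     )
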
def correlate_means_to_gene(means: pd.DataFrame, corr_gene: str = "EOMES"):
    """Calculate gene to gene correlation based on a mean expression table.

    Args:
        means: Pandas DataFrame of mean expression values (genes as columns)
        corr_gene: Gene to correlate all other genes to

    Returns:
        Pandas DataFrame of correlations
    """
    import scipy.stats

    genes = means.columns.values
    cors = pd.DataFrame(index=genes, columns=["spearman_corr", "pvalue"])
    # tab = sc.get.obs_df(sub, keys=[corr_gene], layer=None, use_raw=True)
    table = means.loc[:, [corr_gene]].values

    # Loop over all genes and calculate Spearman's rho against the target gene
    for gene in genes:
        tmp = scipy.stats.spearmanr(table, means.loc[:, [gene]])  # Spearman's rho
        cors.loc[gene, :] = tmp[0:2]

    cors.dropna(axis=0, inplace=True)
    cors.sort_values("spearman_corr", ascending=False, inplace=True)

    return cors

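# Usage sketch (illustrative, not part of the module): assumes a dense adata.X, a "cell_type"
# obs column, and that the chosen gene is present in adata.var_names.
#
#     means = (
#         pd.DataFrame(adata.X, index=adata.obs_names, columns=adata.var_names)
#         .groupby(adata.obs["cell_type"])
#         .mean()
#     )
#     cors = correlate_means_to_gene(means, corr_gene="EOMES")
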
def extended_marker_table(
    adata: AnnData,
    qval_thresh: float = 0.05,
    cell_type_label: str = "cell_type",
    gene_ranks_key: str = "rank_genes_groups",
):
    """Generates an extended marker table with cell types and percentages of expressing cells per cluster.

    Run scanpy.tl.rank_genes_groups before using this function.

    Args:
        adata: AnnData object containing ranked genes
        qval_thresh: Threshold to filter the adjusted p-values by
        cell_type_label: Label containing all cell types
        gene_ranks_key: Key for the ranked gene groups (generated by sc.tl.rank_genes_groups)

    Returns:
        A Pandas DataFrame
    """
    result = adata.uns[gene_ranks_key]
    all_markers = []

    for cluster in result["names"].dtype.names:
        current = pd.DataFrame(
            {
                "gene": result["names"][cluster],
                "score": result["scores"][cluster],
                "log_FC": result["logfoldchanges"][cluster],
                "pval": result["pvals"][cluster],
                "pval_adj": result["pvals_adj"][cluster],
                "cell_type": cluster,
            }
        )

        # Add percentage of expressing cells within and outside the current cluster
        adata.obs["group"] = ["within" if ct == cluster else "outside" for ct in adata.obs.loc[:, cell_type_label]]
        current = add_percentages(adata, table=current, group_by="group", gene_label="gene", ids=["within", "outside"])

        all_markers.append(current)

    all_markers_df = pd.concat(all_markers)
    all_markers_df = all_markers_df[all_markers_df.pval_adj < qval_thresh].copy()

    return all_markers_df

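# Usage sketch (illustrative, not part of the module): requires raw counts in
# adata.layers["counts"] (used by add_percentages) and a "cell_type" obs column.
#
#     sc.tl.rank_genes_groups(adata, groupby="cell_type")
#     markers = extended_marker_table(adata, qval_thresh=0.05, cell_type_label="cell_type")
#     markers.head()
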
def generate_pseudobulk(adata: AnnData, group_key: str = "identifier", sep="\t", save: str = None) -> pd.DataFrame:
    """Generates a pseudobulk for a given key of groups in the AnnData object.

    Looks like:

    +------------+------------------+------------------+
    | Genes      | Group Member 1   | Group Member 2   |
    +============+==================+==================+
    | Gene 1     | Value 1          | Value 2          |
    +------------+------------------+------------------+
    | Gene 2     | Value 2          | Value 3          |
    +------------+------------------+------------------+

    Args:
        adata: AnnData object
        group_key: The key to group by. E.g. by mice, by condition, ... (default: 'identifier')
        sep: Separator to use when saving the pseudobulk table (default: '\t')
        save: Path to save the pseudobulk table to (default: None)

    Returns:
        A Pandas DataFrame containing the pseudobulk table
    """
    pseudobulk = pd.DataFrame(data=adata.var_names.values, columns=["Genes"])

    for i in adata.obs.loc[:, group_key].cat.categories:
        temp = adata.obs.loc[:, group_key] == i
        pseudobulk[i] = adata[temp].X.sum(0, dtype=int)  # column sums (genes)

    if save:
        pseudobulk.to_csv(save, sep=sep, index=False)

    return pseudobulk

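# Usage sketch (illustrative, not part of the module): assumes adata.X holds integer counts and
# that "identifier" is a categorical obs column; the output path is an example only.
#
#     bulk = generate_pseudobulk(adata, group_key="identifier", save="pseudobulk.tsv")
#     bulk.head()  # one row per gene, one column per group member
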
def automated_marker_annotation(
    adata: AnnData,
    organism: str,
    tissue: str,
    marker_file: str,
    key: str = "rank_genes_groups",
    normalize: Optional[Literal["reference", "data"]] = "reference",
    p_value: float = 0.05,
    log_fold_change: float = 2,
):
    """Calculates a marker gene overlap based on pre-existing annotations.

    Currently supported marker files:

    +------------+------------+------------------------------+
    | Organism   | Tissue     | Marker File                  |
    +============+============+==============================+
    | Mouse      | Lung       | lung_particle_markers.txt    |
    +------------+------------+------------------------------+
    | Human      | NA         |                              |
    +------------+------------+------------------------------+

    Args:
        adata: AnnData object containing ranked genes
        organism: Currently supported: 'mouse'
        tissue: Currently supported: 'lung'
        marker_file: Name of the marker file to be used - refer to the table
        key: Key of ranked genes in adata (default: 'rank_genes_groups')
        normalize: Normalization option for the marker gene overlap output (default: 'reference')
        p_value: p-value threshold for existing marker genes (default: 0.05)
        log_fold_change: log fold change threshold for existing marker genes (default: 2)

    Returns:
        Pandas DataFrame of overlapping genes. Visualize with a Seaborn heatmap.
    """
    supported_organisms = {"mouse"}
    supported_tissues = {"lung"}
    supported_marker_files = {"lung_particle_markers.txt"}

    if organism not in supported_organisms:
        print(f"[bold red]Unfortunately organism {organism} is not yet supported.")
        return

    if tissue not in supported_tissues:
        print(f"[bold red]Unfortunately tissue {tissue} is not yet supported.")
        return

    if marker_file not in supported_marker_files:
        print(f"[bold red]Unfortunately marker file {marker_file} could not be found. Please check your spelling.")
        return

    marker_table = pd.read_csv(f"{WORKING_DIRECTORY}/markers/{marker_file}", sep="\t", index_col=None)
    marker_table = marker_table[
        (marker_table.logfoldchange > log_fold_change) & (marker_table.pval_adj < p_value)
    ].copy()

    marker = dict()
    for ct in marker_table["cell_type"].unique():
        tmp = marker_table[marker_table["cell_type"] == ct]
        marker[ct] = tmp.gene.values

    return sc.tl.marker_gene_overlap(adata, marker, key=key, normalize=normalize)

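# Usage sketch (illustrative, not part of the module): assumes a mouse lung dataset with a
# "leiden" clustering in adata.obs.
#
#     sc.tl.rank_genes_groups(adata, groupby="leiden")
#     overlap = automated_marker_annotation(
#         adata, organism="mouse", tissue="lung", marker_file="lung_particle_markers.txt"
#     )
#     import seaborn as sns
#     sns.heatmap(overlap)
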
def de_res_to_anndata(
    adata: AnnData,
    de_res: pd.DataFrame,
    *,
    groupby: str,
    gene_id_col: str = "gene_symbol",
    score_col: str = "score",
    pval_col: str = "pvalue",
    pval_adj_col: Optional[str] = None,
    lfc_col: str = "lfc",
    key_added: str = "rank_genes_groups",
) -> None:
    """Add a tabular differential expression result to AnnData as if it was produced by scanpy.tl.rank_genes_groups.

    Args:
        adata: Annotated data matrix
        de_res: Tabular DE result as a Pandas DataFrame
        groupby: Column in `de_res` that indicates the group. This column must also exist in `adata.obs`.
        gene_id_col: Column in `de_res` that holds the gene identifiers
        score_col: Column in `de_res` that holds the score (results will be ordered by score).
        pval_col: Column in `de_res` that holds the unadjusted pvalue
        pval_adj_col: Column in `de_res` that holds the adjusted pvalue.
            If not specified, the unadjusted p values will be FDR-adjusted.
        lfc_col: Column in `de_res` that holds the log fold change
        key_added: Key under which the results will be stored in adata.uns
    """
    if groupby not in adata.obs.columns or groupby not in de_res.columns:
        raise ValueError("groupby column must exist in both adata and de_res.")

    res_dict = {
        "params": {
            "groupby": groupby,
            "reference": "rest",
            "method": "other",
            "use_raw": True,
            "layer": None,
            "corr_method": "other",
        },
        "names": [],
        "scores": [],
        "pvals": [],
        "pvals_adj": [],
        "logfoldchanges": [],
    }
    df_groupby = de_res.groupby(groupby)
    for _, tmp_df in df_groupby:
        tmp_df = tmp_df.sort_values(score_col, ascending=False)
        res_dict["names"].append(tmp_df[gene_id_col].values)  # type: ignore
        res_dict["scores"].append(tmp_df[score_col].values)  # type: ignore
        res_dict["pvals"].append(tmp_df[pval_col].values)  # type: ignore
        if pval_adj_col is not None:
            res_dict["pvals_adj"].append(tmp_df[pval_adj_col].values)  # type: ignore
        else:
            res_dict["pvals_adj"].append(fdrcorrection(tmp_df[pval_col].values)[1])  # type: ignore
        res_dict["logfoldchanges"].append(tmp_df[lfc_col].values)  # type: ignore

    for key in ["names", "scores", "pvals", "pvals_adj", "logfoldchanges"]:
        res_dict[key] = pd.DataFrame(
            np.vstack(res_dict[key]).T,  # type: ignore
            columns=list(df_groupby.groups.keys()),
        ).to_records(index=False, column_dtypes="O")

    adata.uns[key_added] = res_dict

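# Usage sketch (illustrative, not part of the module): the tiny DE table below is fabricated for
# demonstration and assumes a "cell_type" obs column in `adata`; the default column names
# (gene_symbol, score, pvalue, lfc) are used.
#
#     de_res = pd.DataFrame(
#         {
#             "cell_type": ["AT2", "AT2", "AT1", "AT1"],
#             "gene_symbol": ["Sftpc", "Ager", "Sftpc", "Ager"],
#             "score": [5.1, -1.2, -2.0, 4.3],
#             "pvalue": [1e-4, 0.3, 0.05, 1e-3],
#             "lfc": [2.5, -0.4, -1.1, 2.0],
#         }
#     )
#     de_res_to_anndata(adata, de_res, groupby="cell_type")
#     sc.pl.rank_genes_groups(adata)  # the result behaves like sc.tl.rank_genes_groups output
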