Pipeline Functions

Lower-level functions used by the workflow classes.

Quality Control and Normalization

sc_QC

sc_QC(
    X: DataFrame,
    min_lib_size: float = 1000,
    remove_outlier_cells: bool = True,
    min_percent: float = 0.05,
    max_mito_ratio: float = 0.1,
    min_exp_avg: float = 0,
    min_exp_sum: float = 0,
) -> pd.DataFrame

main QC function in scTenifold pipelines

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	A single-cell RNAseq DataFrame (rows: genes, cols: cells)	required
`min_lib_size`	`float`	Minimum library size of cells	`1000`
`remove_outlier_cells`	`bool`	Whether the QC function will remove the outlier cells	`True`
`min_percent`	`float`	Minimum fraction of cells where the gene needs to be expressed to be included in the analysis.	`0.05`
`max_mito_ratio`	`float`	Maximum mitochondrial genes ratio included in the final df	`0.1`
`min_exp_avg`	`float`	Minimum average expression value in each gene	`0`
`min_exp_sum`	`float`	Minimum sum of expression value in each gene	`0`

Returns:

Name	Type	Description
`X_modified`	`DataFrame`	The DataFrame after QC

Source code in scTenifold/core/_QC.py

def sc_QC(X: pd.DataFrame,
          min_lib_size: float = 1000,
          remove_outlier_cells: bool = True,
          min_percent: float = 0.05,
          max_mito_ratio: float = 0.1,
          min_exp_avg: float = 0,
          min_exp_sum: float = 0) -> pd.DataFrame:
    """
    main QC function in scTenifold pipelines

    Parameters
    ----------
    X: pd.DataFrame
        A single-cell RNAseq DataFrame (rows: genes, cols: cells)
    min_lib_size: int, float, default = 1000
        Minimum library size of cells
    remove_outlier_cells: bool, default = True
        Whether the QC function will remove the outlier cells
    min_percent: float, default = 0.05
        Minimum fraction of cells where the gene needs to be expressed to be included in the analysis.
    max_mito_ratio: float, default = 0.1
        Maximum mitochondrial genes ratio included in the final df
    min_exp_avg: float, default = 0
        Minimum average expression value in each gene
    min_exp_sum: float, default = 0
        Minimum sum of expression value in each gene
    Returns
    -------
    X_modified: pd.DataFrame
        The DataFrame after QC
    """
    X = X.copy()
    outlier_coef = 1.5
    X[X < 0] = 0
    lib_size = X.sum(axis=0)
    before_s = X.shape[1]
    X = X.loc[:, lib_size > min_lib_size]
    print(f"Removed {before_s - X.shape[1]} cells with lib size < {min_lib_size}")
    if remove_outlier_cells:
        lib_size = X.sum(axis=0)
        before_s = X.shape[1]
        Q1, Q3 = lib_size.quantile([0.25, 0.75])
        interquartile_range = Q3 - Q1
        X = X.loc[:, (lib_size >= Q1 - interquartile_range * outlier_coef) &
                     (lib_size <= Q3 + interquartile_range * outlier_coef)]
        print(f"Removed {before_s - X.shape[1]} outlier cells from original data")
    mt_genes = X.index.str.upper().str.match("^MT-")
    if any(mt_genes):
        print(f"Found mitochondrial genes: {X[mt_genes].index.to_list()}")
        before_s = X.shape[1]
        mt_rates = X[mt_genes].sum(axis=0) / X.sum(axis=0)
        X = X.loc[:, mt_rates < max_mito_ratio]
        print(f"Removed {before_s - X.shape[1]} samples from original data (mt genes ratio > {max_mito_ratio})")
    else:
        warn("Mitochondrial genes were not found. Be aware that apoptotic cells may be present in your sample.")
    before_g = X.shape[0]
    X = X[(X != 0).mean(axis=1) > min_percent]
    print(f"Removed {before_g - X.shape[0]} genes expressed in less than {min_percent} of data")

    before_g = X.shape[0]
    if X.shape[1] > 500:
        X = X.loc[X.mean(axis=1) >= min_exp_avg, :]
    else:
        X = X.loc[X.sum(axis=1) >= min_exp_sum, :]
    print(f"Removed {before_g - X.shape[0]} genes with expression values: average < {min_exp_avg} or sum < {min_exp_sum}")
    return X

cpm_norm

cpm_norm(
    X: Union[ndarray, DataFrame],
) -> Union[np.ndarray, pd.DataFrame]

Counts-per-million normalize a genes-by-cells matrix.

Parameters:

Name	Type	Description	Default
`X`	`Union[ndarray, DataFrame]`	Genes-by-cells count matrix.	required

Returns:

Type	Description
Normalized matrix with the same type and shape as ``X``.

Source code in scTenifold/core/_norm.py

def cpm_norm(X: Union[np.ndarray, pd.DataFrame]) -> Union[np.ndarray, pd.DataFrame]:
    """Counts-per-million normalize a genes-by-cells matrix.

    Parameters
    ----------
    X
        Genes-by-cells count matrix.

    Returns
    -------
    Normalized matrix with the same type and shape as ``X``.
    """
    lib_size = X.sum(axis=0)
    safe_lib_size = lib_size.replace(0, np.nan) if isinstance(lib_size, pd.Series) else np.where(lib_size == 0, np.nan, lib_size)
    normalized = X * 1e6 / safe_lib_size
    if isinstance(normalized, pd.DataFrame):
        return normalized.fillna(0)
    return np.nan_to_num(normalized)

Tensor Decomposition

tensor_decomp

tensor_decomp(
    networks: ndarray,
    gene_names: Sequence[str],
    method: str = "parafac",
    n_decimal: int = 1,
    K: int = 5,
    tol: float = 1e-06,
    max_iter: int = 1000,
    random_state: int = 42,
    **kwargs,
) -> pd.DataFrame

Perform tensor decomposition on pc networks

Parameters:

Name	Type	Description	Default
`networks`	`ndarray`	Concatenated network, expected shape = (n_genes, n_genes, n_pcnets)	required
`gene_names`	`Sequence[str]`	The name of each gene in the network (order matters)	required
`method`	`str`	Tensor decomposition method, tensorly's decomposition method was used: http://tensorly.org/stable/modules/api.html#module-tensorly.decomposition	`'parafac'`
`n_decimal`	`int`	Number of decimal in the final df	`1`
`K`	`int`	Rank in parafac function	`5`
`tol`	`float`	Tolerance in the iteration	`1e-06`
`max_iter`	`int`	Number of interation	`1000`
`random_state`	`int`	Random seed used to reproduce the same result	`42`
`**kwargs`		Keyword arguments used in the decomposition function	`{}`

Returns:

Name	Type	Description
`tensor_decomp_df`	`DataFrame`	The result of tensor decomposition, expected shape = (n_genes, n_genes)

References

http://tensorly.org/stable/modules/api.html#module-tensorly.decomposition

Source code in scTenifold/core/_decomposition.py

@timer
def tensor_decomp(networks: np.ndarray,
                  gene_names: Sequence[str],
                  method: str = "parafac",
                  n_decimal: int = 1,
                  K: int = 5,
                  tol: float = 1e-6,
                  max_iter: int = 1000,
                  random_state: int = 42,
                  **kwargs) -> pd.DataFrame:
    """
    Perform tensor decomposition on pc networks

    Parameters
    ----------
    networks: np.ndarray
        Concatenated network, expected shape = (n_genes, n_genes, n_pcnets)
    gene_names: sequence of str
        The name of each gene in the network (order matters)
    method: str, default = 'parafac'
        Tensor decomposition method, tensorly's decomposition method was used:
        http://tensorly.org/stable/modules/api.html#module-tensorly.decomposition
    n_decimal: int
        Number of decimal in the final df
    K: int
        Rank in parafac function
    tol: float
        Tolerance in the iteration
    max_iter: int
        Number of interation
    random_state: int
        Random seed used to reproduce the same result
    **kwargs:
        Keyword arguments used in the decomposition function

    Returns
    -------
    tensor_decomp_df: pd.DataFrame
        The result of tensor decomposition, expected shape = (n_genes, n_genes)

    References
    ----------
    http://tensorly.org/stable/modules/api.html#module-tensorly.decomposition

    """
    # Us, est, res_hist = cp_als(networks, n_components=K, max_iter=max_iter, tol=tol)
    # print(est.shape, len(Us), res_hist)
    print("Using tensorly")
    factors = getattr(decomposition, method)(networks, rank=K, n_iter_max=max_iter, tol=tol,
                                             random_state=random_state, **kwargs)
    estimate = tl.cp_to_tensor(factors)
    print(estimate.shape)
    out = np.sum(estimate, axis=-1) / networks.shape[-1]
    out = np.round(out / np.max(abs(out)), n_decimal)
    return pd.DataFrame(out, index=gene_names, columns=gene_names)

Knockout Internals

ko_propagation

ko_propagation(
    B: ndarray,
    x: ndarray,
    ko_gene_id: Sequence[int],
    degree: int = 1,
) -> np.ndarray

Propagate a gene knockout through an adjacency matrix.

Parameters:

Name	Type	Description	Default
`B`	`ndarray`	Adjacency matrix (genes x genes); diagonal is zeroed in-place on a copy.	required
`x`	`ndarray`	Expression matrix (genes x cells).	required
`ko_gene_id`	`Sequence[int]`	Index of the gene to knock out.	required
`degree`	`int`	Maximum propagation depth.	`1`

Returns:

Type	Description
`Knocked-out expression matrix (non-negative).`

Source code in scTenifold/core/_ko.py

def ko_propagation(B: np.ndarray,
                   x: np.ndarray,
                   ko_gene_id: Sequence[int],
                   degree: int = 1) -> np.ndarray:
    """Propagate a gene knockout through an adjacency matrix.

    Parameters
    ----------
    B
        Adjacency matrix (genes x genes); diagonal is zeroed in-place on a copy.
    x
        Expression matrix (genes x cells).
    ko_gene_id
        Index of the gene to knock out.
    degree
        Maximum propagation depth.

    Returns
    -------
    Knocked-out expression matrix (non-negative).
    """
    adj_mat = B.copy()
    np.fill_diagonal(adj_mat, 0)
    x_ko = x.copy()
    perturb = np.zeros(shape=x.shape)
    perturb[ko_gene_id, :] = x[ko_gene_id, :]
    is_visited = np.zeros(x_ko.shape[0], dtype=bool)
    x_ko = x_ko - perturb
    for d in range(degree):
        if not is_visited.all():
            perturb = adj_mat @ perturb
            new_visited = (perturb != 0).any(axis=1)
            adj_mat[is_visited, :] = 0
            adj_mat[:, is_visited] = 0
            is_visited = is_visited | new_visited
            x_ko = x_ko - perturb
    return np.where(x_ko >= 0, x_ko, 0)

reconstruct_pcnets

reconstruct_pcnets(
    nets: List[coo_matrix],
    X_df: DataFrame,
    ko_gene_id: Sequence[int],
    degree: int = 1,
    **kwargs,
) -> List[np.ndarray]

Rebuild PC networks from knocked-out expression for each input net.

Parameters:

Name	Type	Description	Default
`nets`	`List[coo_matrix]`	PC networks (sparse) used to seed propagation.	required
`X_df`	`DataFrame`	Expression DataFrame (genes x cells).	required
`ko_gene_id`	`Sequence[int]`	Index of the gene to knock out.	required
`degree`	`int`	Propagation depth passed to :func:`ko_propagation`.	`1`
`**kwargs`		Forwarded to :func:`scTenifold.core._networks.make_networks`.	`{}`

Returns:

Type	Description
`List of post-knockout PC networks.`

Source code in scTenifold/core/_ko.py

def reconstruct_pcnets(nets: List[coo_matrix],
                       X_df: pd.DataFrame,
                       ko_gene_id: Sequence[int],
                       degree: int = 1,
                       **kwargs) -> List[np.ndarray]:
    """Rebuild PC networks from knocked-out expression for each input net.

    Parameters
    ----------
    nets
        PC networks (sparse) used to seed propagation.
    X_df
        Expression DataFrame (genes x cells).
    ko_gene_id
        Index of the gene to knock out.
    degree
        Propagation depth passed to :func:`ko_propagation`.
    **kwargs
        Forwarded to :func:`scTenifold.core._networks.make_networks`.

    Returns
    -------
    List of post-knockout PC networks.
    """
    ko_nets = []
    network_kws = dict(kwargs)
    network_kws["n_nets"] = 1
    for net in nets:
        data = ko_propagation(net.toarray(), X_df.values, ko_gene_id, degree)
        data = pd.DataFrame(data, index=X_df.index, columns=X_df.columns)
        ko_net = make_networks(data, **network_kws)[0]
        ko_nets.append(ko_net)
    return ko_nets