Skip to content

Pipeline Functions

Lower-level functions used by the workflow classes.

Quality Control and Normalization

sc_QC

sc_QC(
    X: DataFrame,
    min_lib_size: float = 1000,
    remove_outlier_cells: bool = True,
    min_percent: float = 0.05,
    max_mito_ratio: float = 0.1,
    min_exp_avg: float = 0,
    min_exp_sum: float = 0,
) -> pd.DataFrame

main QC function in scTenifold pipelines

Parameters:

Name Type Description Default
X DataFrame

A single-cell RNAseq DataFrame (rows: genes, cols: cells)

required
min_lib_size float

Minimum library size of cells

1000
remove_outlier_cells bool

Whether the QC function will remove the outlier cells

True
min_percent float

Minimum fraction of cells where the gene needs to be expressed to be included in the analysis.

0.05
max_mito_ratio float

Maximum mitochondrial genes ratio included in the final df

0.1
min_exp_avg float

Minimum average expression value in each gene

0
min_exp_sum float

Minimum sum of expression value in each gene

0

Returns:

Name Type Description
X_modified DataFrame

The DataFrame after QC

Source code in scTenifold/core/_QC.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def sc_QC(X: pd.DataFrame,
          min_lib_size: float = 1000,
          remove_outlier_cells: bool = True,
          min_percent: float = 0.05,
          max_mito_ratio: float = 0.1,
          min_exp_avg: float = 0,
          min_exp_sum: float = 0) -> pd.DataFrame:
    """
    main QC function in scTenifold pipelines

    Parameters
    ----------
    X: pd.DataFrame
        A single-cell RNAseq DataFrame (rows: genes, cols: cells)
    min_lib_size: int, float, default = 1000
        Minimum library size of cells
    remove_outlier_cells: bool, default = True
        Whether the QC function will remove the outlier cells
    min_percent: float, default = 0.05
        Minimum fraction of cells where the gene needs to be expressed to be included in the analysis.
    max_mito_ratio: float, default = 0.1
        Maximum mitochondrial genes ratio included in the final df
    min_exp_avg: float, default = 0
        Minimum average expression value in each gene
    min_exp_sum: float, default = 0
        Minimum sum of expression value in each gene
    Returns
    -------
    X_modified: pd.DataFrame
        The DataFrame after QC
    """
    X = X.copy()
    outlier_coef = 1.5
    X[X < 0] = 0
    lib_size = X.sum(axis=0)
    before_s = X.shape[1]
    X = X.loc[:, lib_size > min_lib_size]
    print(f"Removed {before_s - X.shape[1]} cells with lib size < {min_lib_size}")
    if remove_outlier_cells:
        lib_size = X.sum(axis=0)
        before_s = X.shape[1]
        Q1, Q3 = lib_size.quantile([0.25, 0.75])
        interquartile_range = Q3 - Q1
        X = X.loc[:, (lib_size >= Q1 - interquartile_range * outlier_coef) &
                     (lib_size <= Q3 + interquartile_range * outlier_coef)]
        print(f"Removed {before_s - X.shape[1]} outlier cells from original data")
    mt_genes = X.index.str.upper().str.match("^MT-")
    if any(mt_genes):
        print(f"Found mitochondrial genes: {X[mt_genes].index.to_list()}")
        before_s = X.shape[1]
        mt_rates = X[mt_genes].sum(axis=0) / X.sum(axis=0)
        X = X.loc[:, mt_rates < max_mito_ratio]
        print(f"Removed {before_s - X.shape[1]} samples from original data (mt genes ratio > {max_mito_ratio})")
    else:
        warn("Mitochondrial genes were not found. Be aware that apoptotic cells may be present in your sample.")
    before_g = X.shape[0]
    X = X[(X != 0).mean(axis=1) > min_percent]
    print(f"Removed {before_g - X.shape[0]} genes expressed in less than {min_percent} of data")

    before_g = X.shape[0]
    if X.shape[1] > 500:
        X = X.loc[X.mean(axis=1) >= min_exp_avg, :]
    else:
        X = X.loc[X.sum(axis=1) >= min_exp_sum, :]
    print(f"Removed {before_g - X.shape[0]} genes with expression values: average < {min_exp_avg} or sum < {min_exp_sum}")
    return X

cpm_norm

cpm_norm(
    X: Union[ndarray, DataFrame],
) -> Union[np.ndarray, pd.DataFrame]

Counts-per-million normalize a genes-by-cells matrix.

Parameters:

Name Type Description Default
X Union[ndarray, DataFrame]

Genes-by-cells count matrix.

required

Returns:

Type Description
Normalized matrix with the same type and shape as ``X``.
Source code in scTenifold/core/_norm.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
def cpm_norm(X: Union[np.ndarray, pd.DataFrame]) -> Union[np.ndarray, pd.DataFrame]:
    """Counts-per-million normalize a genes-by-cells matrix.

    Parameters
    ----------
    X
        Genes-by-cells count matrix.

    Returns
    -------
    Normalized matrix with the same type and shape as ``X``.
    """
    lib_size = X.sum(axis=0)
    safe_lib_size = lib_size.replace(0, np.nan) if isinstance(lib_size, pd.Series) else np.where(lib_size == 0, np.nan, lib_size)
    normalized = X * 1e6 / safe_lib_size
    if isinstance(normalized, pd.DataFrame):
        return normalized.fillna(0)
    return np.nan_to_num(normalized)

Tensor Decomposition

tensor_decomp

tensor_decomp(
    networks: ndarray,
    gene_names: Sequence[str],
    method: str = "parafac",
    n_decimal: int = 1,
    K: int = 5,
    tol: float = 1e-06,
    max_iter: int = 1000,
    random_state: int = 42,
    **kwargs,
) -> pd.DataFrame

Perform tensor decomposition on pc networks

Parameters:

Name Type Description Default
networks ndarray

Concatenated network, expected shape = (n_genes, n_genes, n_pcnets)

required
gene_names Sequence[str]

The name of each gene in the network (order matters)

required
method str

Tensor decomposition method, tensorly's decomposition method was used: http://tensorly.org/stable/modules/api.html#module-tensorly.decomposition

'parafac'
n_decimal int

Number of decimal in the final df

1
K int

Rank in parafac function

5
tol float

Tolerance in the iteration

1e-06
max_iter int

Number of interation

1000
random_state int

Random seed used to reproduce the same result

42
**kwargs

Keyword arguments used in the decomposition function

{}

Returns:

Name Type Description
tensor_decomp_df DataFrame

The result of tensor decomposition, expected shape = (n_genes, n_genes)

References

http://tensorly.org/stable/modules/api.html#module-tensorly.decomposition

Source code in scTenifold/core/_decomposition.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
@timer
def tensor_decomp(networks: np.ndarray,
                  gene_names: Sequence[str],
                  method: str = "parafac",
                  n_decimal: int = 1,
                  K: int = 5,
                  tol: float = 1e-6,
                  max_iter: int = 1000,
                  random_state: int = 42,
                  **kwargs) -> pd.DataFrame:
    """
    Perform tensor decomposition on pc networks

    Parameters
    ----------
    networks: np.ndarray
        Concatenated network, expected shape = (n_genes, n_genes, n_pcnets)
    gene_names: sequence of str
        The name of each gene in the network (order matters)
    method: str, default = 'parafac'
        Tensor decomposition method, tensorly's decomposition method was used:
        http://tensorly.org/stable/modules/api.html#module-tensorly.decomposition
    n_decimal: int
        Number of decimal in the final df
    K: int
        Rank in parafac function
    tol: float
        Tolerance in the iteration
    max_iter: int
        Number of interation
    random_state: int
        Random seed used to reproduce the same result
    **kwargs:
        Keyword arguments used in the decomposition function

    Returns
    -------
    tensor_decomp_df: pd.DataFrame
        The result of tensor decomposition, expected shape = (n_genes, n_genes)

    References
    ----------
    http://tensorly.org/stable/modules/api.html#module-tensorly.decomposition

    """
    # Us, est, res_hist = cp_als(networks, n_components=K, max_iter=max_iter, tol=tol)
    # print(est.shape, len(Us), res_hist)
    print("Using tensorly")
    factors = getattr(decomposition, method)(networks, rank=K, n_iter_max=max_iter, tol=tol,
                                             random_state=random_state, **kwargs)
    estimate = tl.cp_to_tensor(factors)
    print(estimate.shape)
    out = np.sum(estimate, axis=-1) / networks.shape[-1]
    out = np.round(out / np.max(abs(out)), n_decimal)
    return pd.DataFrame(out, index=gene_names, columns=gene_names)

Knockout Internals

ko_propagation

ko_propagation(
    B: ndarray,
    x: ndarray,
    ko_gene_id: Sequence[int],
    degree: int = 1,
) -> np.ndarray

Propagate a gene knockout through an adjacency matrix.

Parameters:

Name Type Description Default
B ndarray

Adjacency matrix (genes x genes); diagonal is zeroed in-place on a copy.

required
x ndarray

Expression matrix (genes x cells).

required
ko_gene_id Sequence[int]

Index of the gene to knock out.

required
degree int

Maximum propagation depth.

1

Returns:

Type Description
Knocked-out expression matrix (non-negative).
Source code in scTenifold/core/_ko.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def ko_propagation(B: np.ndarray,
                   x: np.ndarray,
                   ko_gene_id: Sequence[int],
                   degree: int = 1) -> np.ndarray:
    """Propagate a gene knockout through an adjacency matrix.

    Parameters
    ----------
    B
        Adjacency matrix (genes x genes); diagonal is zeroed in-place on a copy.
    x
        Expression matrix (genes x cells).
    ko_gene_id
        Index of the gene to knock out.
    degree
        Maximum propagation depth.

    Returns
    -------
    Knocked-out expression matrix (non-negative).
    """
    adj_mat = B.copy()
    np.fill_diagonal(adj_mat, 0)
    x_ko = x.copy()
    perturb = np.zeros(shape=x.shape)
    perturb[ko_gene_id, :] = x[ko_gene_id, :]
    is_visited = np.zeros(x_ko.shape[0], dtype=bool)
    x_ko = x_ko - perturb
    for d in range(degree):
        if not is_visited.all():
            perturb = adj_mat @ perturb
            new_visited = (perturb != 0).any(axis=1)
            adj_mat[is_visited, :] = 0
            adj_mat[:, is_visited] = 0
            is_visited = is_visited | new_visited
            x_ko = x_ko - perturb
    return np.where(x_ko >= 0, x_ko, 0)

reconstruct_pcnets

reconstruct_pcnets(
    nets: List[coo_matrix],
    X_df: DataFrame,
    ko_gene_id: Sequence[int],
    degree: int = 1,
    **kwargs,
) -> List[np.ndarray]

Rebuild PC networks from knocked-out expression for each input net.

Parameters:

Name Type Description Default
nets List[coo_matrix]

PC networks (sparse) used to seed propagation.

required
X_df DataFrame

Expression DataFrame (genes x cells).

required
ko_gene_id Sequence[int]

Index of the gene to knock out.

required
degree int

Propagation depth passed to :func:ko_propagation.

1
**kwargs

Forwarded to :func:scTenifold.core._networks.make_networks.

{}

Returns:

Type Description
List of post-knockout PC networks.
Source code in scTenifold/core/_ko.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def reconstruct_pcnets(nets: List[coo_matrix],
                       X_df: pd.DataFrame,
                       ko_gene_id: Sequence[int],
                       degree: int = 1,
                       **kwargs) -> List[np.ndarray]:
    """Rebuild PC networks from knocked-out expression for each input net.

    Parameters
    ----------
    nets
        PC networks (sparse) used to seed propagation.
    X_df
        Expression DataFrame (genes x cells).
    ko_gene_id
        Index of the gene to knock out.
    degree
        Propagation depth passed to :func:`ko_propagation`.
    **kwargs
        Forwarded to :func:`scTenifold.core._networks.make_networks`.

    Returns
    -------
    List of post-knockout PC networks.
    """
    ko_nets = []
    network_kws = dict(kwargs)
    network_kws["n_nets"] = 1
    for net in nets:
        data = ko_propagation(net.toarray(), X_df.values, ko_gene_id, degree)
        data = pd.DataFrame(data, index=X_df.index, columns=X_df.columns)
        ko_net = make_networks(data, **network_kws)[0]
        ko_nets.append(ko_net)
    return ko_nets