-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscCell_count_normalizations.py
More file actions
90 lines (73 loc) · 3.67 KB
/
scCell_count_normalizations.py
File metadata and controls
90 lines (73 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Author: Cyrille L. Delley
#
#
#
#
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse, io
from scipy.stats.mstats import gmean
from scipy.linalg import helmert
from sklearn.decomposition import TruncatedSVD
import statsmodels.api as sm
import anndata as ad
def GLM_regression(andat, covariables:list, antibodies:list=[], components:int=1, offset:bool=False, force_zeros:bool=False):
"""Perform general linear model regression of each Ab vector with provided covariables.
currently supported covariables are a subset from: [IgG1','amplicon','ab_raw','ab_umi']
Covariables are SVD transformed and the first N components are retained (default 1)
currently uses linear regression on log transformed data and a Gaussian error model."""
if antibodies == []:
antibodies = list(andat.var.index)
# add pseudo count, log transform, normalize and column center the covariable matrix before SVD
covar = np.log(np.array(andat.obs[covariables])+1)
covar = covar / covar.max(axis=0)
U,s,Vt= np.linalg.svd(covar - covar.mean(axis=0), full_matrices=False)
fig, ax = plt.subplots(1,1,figsize=(4,4))
ax.plot(s, 'o')
plt.show()
# use left eigenvectors for regression, correlation is scale invariant
if offset:
factors = sm.add_constant(U[:,:components], prepend=True)
else:
factors = U[:,:components]
# if some Ab's have all zero counts or were not mesured, the regression cannot be performed on the raw matrix
# skip columns with all zero in regression
Ab_filter = andat.X.sum(axis=0) != 0
fit_val = np.zeros([len(U), len(antibodies)])
residuals = np.zeros([len(U), len(antibodies)])
params = np.zeros([len(antibodies),factors.shape[1]])
for i in range(len(antibodies)):
if Ab_filter[i]:
linear_model_result = sm.GLM(np.log(andat.X[:,i]+1), factors, family=sm.families.Gaussian()).fit()
fit_val[:,i] = linear_model_result.fittedvalues
residuals[:,i] = linear_model_result.resid_response
params[i] = linear_model_result.params
andat_new = ad.AnnData(X=residuals, obs=andat.obs, var=andat[:,antibodies].var)
if force_zeros:
zero_pos = andat.X == 0
for v in zero_pos:
andat_new[:,v].X == 1.
return andat_new
def compositional_transform(andat_raw, add_pseudocount:bool=False, features:list=[]):
"""calculated the three Aitchison geometry transforms for the Ab counts.
- alr uses the IgG1 counts as universal reference
- ilr contrasts are based on the SVD of clr
if add_pseudocount is set to true add 1 to prevent zero division, otherwise
cells with zero counts in the denominator create inf and have to be filtered
before downstream analysis, e.g.:
clr_filter = np.isfinite(clr).all(axis=1)
clr = clr[clr_filter,:]"""
if features == []:
features = list(andat_raw.var.index)
if add_pseudocount:
clr_data = np.log((1+andat_raw[:,features].X)/gmean(andat_raw[:,features].X+1, axis=1).reshape(-1,1))
alr_data = np.log((1+andat_raw[:,features].X)/(andat_raw[:,features].X[:,-1]+1).reshape(-1,1))
U,s,Vt = np.linalg.svd(clr_data, full_matrices=False)
ilr_data = np.dot(U*s,helmert(len(s)).T)
else:
clr_data = np.log(andat_raw[:,features].X/gmean(andat_raw[:,features].X, axis=1).reshape(-1,1))
alr_data = np.log(andat_raw[:,features].X/(andat_raw[:,features].X[:,-1].reshape(-1,1)))
finite_clr = np.isfinite(clr_data).all(axis=1)
U,s,Vt = np.linalg.svd(clr_data[finite_clr,:], full_matrices=False)
ilr_data = np.dot(U*s,helmert(len(s)).T)
return alr_data, clr_data, ilr_data