In [1]:
import matplotlib
%matplotlib inline
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
sns.set_context('poster')
sns.set_style('white')
pd.options.mode.chained_assignment = None # default='warn'
import warnings
warnings.simplefilter('ignore')
# for GMM
import itertools
from scipy import linalg
import matplotlib as mpl
from sklearn import mixture
import math
import matplotlib.mlab as mlab
In [2]:
def filter_by_coverage(cur_r, bad_percent, bad_samples):
def filter_row(row):
num_of_samples = len(row)
valid = np.sum(np.array(([(min_coverage < row) & (row < max_coverage)])))
return num_of_samples - valid <= bad_samples
min_coverage = np.percentile(cur_r, bad_percent, axis=0)
max_coverage = np.percentile(cur_r, 100-bad_percent, axis=0)
good_coverage = np.array([filter_row(row) for row in cur_r])
return good_coverage
In [3]:
r = np.genfromtxt("infant_gut/infant_gut_pure_without_ref/matrices/R_all", dtype=int, delimiter=' ')
x = np.genfromtxt("infant_gut/infant_gut_pure_without_ref/matrices/X_all", dtype=int, delimiter=' ')
names = ["strain 1", "strain 3", "strain 4"]
num_of_strains = len(names)
print("%s sites" % len(r))
In [4]:
mask = x[:, 0:(num_of_strains-1)]
mask[mask > 0] = 1
r = np.delete(r, [i for i in range(num_of_strains)], axis=1)
x = np.delete(x, [i for i in range(num_of_strains)], axis=1)
In [5]:
Ncut = 6
print("Delete zero and almost zero profiles:")
good_ind = [i for i in range(np.shape(x)[0])
if not ((np.abs(r[i, :] - x[i, :]) <= Ncut).all() or (x[i, :] <= Ncut).all())]
print(len(good_ind), "remained")
x = x[good_ind, :]
r = r[good_ind, :]
mask = mask[good_ind, :]
In [6]:
f = np.divide(x, r)
good_coverage = filter_by_coverage(r, 20, 2)
mask_filtered = mask[good_coverage, :]
f_filtered = f[good_coverage, :]
print(len(f_filtered), "remained")
genotypes = f_filtered.T
In [7]:
strain1 = list(map(float, "0.73 0.74 0.04 0.13 0.17 0.04 0.32 0.75 0.30 0.20 0.0".split()))
strain3 = list(map(float, "0.24 0.20 0.95 0.80 0.80 0.93 0.52 0.19 0.64 0.65 1.0".split()))
strain4 = list(map(float, "0.03 0.06 0.02 0.07 0.03 0.02 0.16 0.06 0.06 0.15 0.0".split()))
num_samples = len(strain1)
real_freqs = np.array((strain1, strain3, strain4))
for i, f in zip(range(1, num_samples+1), np.max(real_freqs, axis = 0)):
print(i, f)
In [8]:
print(real_freqs)
In [202]:
print(real_freqs[:,0])
In [204]:
f, axarr = plt.subplots(len(genotypes), sharex=True)
for i in range(len(genotypes)):
axarr[i].hist(genotypes[i], bins=100);
axarr[i].axis('off')
axarr[i].set_title(i+1, x=-0.05, y=0.2)
axarr[i].text(1.1, 0.65, [float('{:.2f}'.format(m)) for m in sorted(real_freqs[:,i])])
#axarr[i].set_xlim((0.03, 1.))
f.set_figheight(30)
f.set_figwidth(10)
In [118]:
X = genotypes[8].reshape((len(genotypes[8]), 1))
In [284]:
def plot_model_2(sample, covariance_type):
X = genotypes[sample].reshape((len(genotypes[sample]), 1))
model = mixture.BayesianGaussianMixture(n_components=20,
covariance_type=covariance_type,
tol=0.001,
reg_covar=1e-06,
max_iter=1000,
n_init=1,
init_params='kmeans',
weight_concentration_prior_type='dirichlet_process',
weight_concentration_prior=None,
mean_precision_prior=None,
mean_prior=None,
degrees_of_freedom_prior=None,
covariance_prior=None,
random_state=None,
warm_start=False,
verbose=0, verbose_interval=10)
model.fit(X)
Y_ = set(model.predict(X))
model.means_ = model.means_.flatten()
model.covariances_ = model.covariances_.flatten()
weights = np.ones_like(X)/float(len(X))
plt.hist(X, 40, weights=weights, alpha=0.4);
mus = []
ws = []
for i in Y_:
#if model.weights_[i] < 0.02:
# print("one more component")
# continue
mu = model.means_[i]
mus.append(mu)
ws.append(model.weights_[i])
variance = model.covariances_[i]
sigma = math.sqrt(variance)
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
pdf = mlab.normpdf(x, mu, sigma)
plt.plot(x,((pdf / np.max(pdf))* model.weights_[i])/10)
plt.xlabel("SNV frequencies")
print([float('{:.2f}'.format(m)) for m in sorted(mus)])
#print([float('{:.4f}'.format(w)) for w in sorted(ws)])
plot_model_2(3, 'spherical')
In [287]:
num_of_samples = 11
f, axarr = plt.subplots(num_of_samples, sharex=True)
for sample in range(num_of_samples):
X = genotypes[sample].reshape((len(genotypes[sample]), 1))
model = mixture.BayesianGaussianMixture(n_components=20,
covariance_type='spherical',
tol=0.001,
reg_covar=1e-06,
max_iter=1000,
n_init=3,
init_params='kmeans',
weight_concentration_prior_type='dirichlet_process',
weight_concentration_prior=None,
mean_precision_prior=None,
mean_prior=None,
degrees_of_freedom_prior=None,
covariance_prior=None,
random_state=None,
warm_start=False,
verbose=0, verbose_interval=10)
model.fit(X)
Y_ = set(model.predict(X))
model.means_ = model.means_.flatten()
model.covariances_ = model.covariances_.flatten()
weights = np.ones_like(X)/float(len(X))
axarr[sample].hist(X, 40, weights=weights, alpha=0.4);
mus = []
ws = []
for i in Y_:
#if model.weights_[i] < 0.02:
# print("one more component")
# continue
mu = model.means_[i]
mus.append(mu)
ws.append(model.weights_[i])
variance = model.covariances_[i]
sigma = math.sqrt(variance)
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
pdf = mlab.normpdf(x, mu, sigma)
axarr[sample].plot(x,((pdf / np.max(pdf))* model.weights_[i])/10)
axarr[sample].text(1.3, 0.0,
'Predicted: ' + str([float('{:.2f}'.format(m)) for m in sorted(mus)]) +
'\n' +
' Real: ' + str([float('{:.2f}'.format(m)) for m in sorted(real_freqs[:,sample])]))
axarr[sample].set_xlim((-0.1, 1.1))
f.set_figheight(5 * num_of_samples)
f.set_figwidth(10)
In [296]:
a = genotypes[0]
((0.4 < a) & (a < 0.6)).sum() / len(a)
Out[296]:
In [307]:
num_of_samples = 11
f, axarr = plt.subplots(num_of_samples, sharex=True)
eps = 0.01
for sample in range(num_of_samples):
X = genotypes[sample]
margin = eps
while ((0.5-margin < X) & (X < 0.5+margin)).sum() / len(X) < 0.05:
margin += eps
if 0.5 + margin >= 0.7:
d_str = 'DOMINATED\n'
color = 'red'
else:
color = 'blue'
d_str = ''
weights = np.ones_like(X)/float(len(X))
axarr[sample].hist(X, 40, weights=weights, alpha=0.4, color=color);
axarr[sample].text(1.3, 0.0, d_str +
'Margin: ' + str(round(0.5-margin,2)) + ' -- ' + str(round(0.5+margin,2)) + '\n' +
'Real: ' + str([float('{:.2f}'.format(m)) for m in sorted(real_freqs[:,sample])]))
axarr[sample].set_xlim((-0.1, 1.1))
f.set_figheight(5 * num_of_samples)
f.set_figwidth(10)
In [ ]: