In [1]:
cd ../executible/
In [2]:
%run Cu_transition_functionalized.py
In [3]:
df1_raw_FM40 = raw_data_cleanup("5G_counts.tsv")
columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
           '5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']
df2_TPM = TPM_counts(df1_raw_FM40, "start_coord", "end_coord", columns, remove_zero=True)  # TPM counts
df2_TPM_log2 = log_2_transform(df2_TPM, "5GB1_FM40_T0m_TR2", "5GB1_FM40_T180m_TR1")  # TPM, log2-transformed
df2_TPM_mean = mean_center(df2_TPM, "5GB1_FM40_T0m_TR2", "5GB1_FM40_T180m_TR1")  # TPM, mean-centered
df3_pearson_r = congruency_table(df2_TPM, "5GB1_FM40_T0m_TR2", "5GB1_FM40_T180m_TR1", step=df2_TPM.shape[0])
df3_euclidean_mean = euclidean_distance(df2_TPM_mean, "5GB1_FM40_T0m_TR2", "5GB1_FM40_T180m_TR1")
df3_euclidean_log2 = euclidean_distance(df2_TPM_log2, "5GB1_FM40_T0m_TR2", "5GB1_FM40_T180m_TR1")
print("The shape of the TPM table is ", df2_TPM.shape)
print("The shape of the pearson_r matrix is ", df3_pearson_r.shape)
In [4]:
%matplotlib inline
In [5]:
df2_TPM_values = df2_TPM.loc[:,"5GB1_FM40_T0m_TR2":"5GB1_FM40_T180m_TR1"]
df2_TPM_values
Out[5]:
In [6]:
df2_TPM_values.describe()
Out[6]:
In [121]:
df2_TPM.idxmax?
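For reference, idxmax returns the row label of each column's maximum, so the top gene per timepoint is a one-liner:
In [ ]:
# Gene with the maximum TPM in each sample column.
print(df2_TPM_values.idxmax())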
In [138]:
index = df2_TPM_values.sort_values("5GB1_FM40_T0m_TR2", ascending=False).index.tolist()
top_expressed = df2_TPM.loc[index].iloc[:20, :]
top_expressed.to_csv("top_expressed.csv")
top_expressed
#note: looked with Mitch at gene MCBURv2_200002 - it is lowly expressed in all other transcriptomics data except
#FM81, where we added extra NO3 and it is within the top 15 expressed genes
#gene 20471 is top expressed (within the top 15 genes across all transcriptomics samples that we have)
Out[138]:
In [10]:
df2_TPM_values.plot.hist(bins=100)
Out[10]:
In [11]:
import matplotlib.pyplot as plt
plt.style.use("seaborn-white")
In [12]:
"""
columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
'5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']
"""
x1 = df2_TPM_values["5GB1_FM40_T0m_TR2"]
x2 = df2_TPM_values['5GB1_FM40_T10m_TR3']
x3 = df2_TPM_values['5GB1_FM40_T20m_TR2']
x4 = df2_TPM_values['5GB1_FM40_T40m_TR1']
x5 = df2_TPM_values['5GB1_FM40_T60m_TR1']
x6 = df2_TPM_values['5GB1_FM40_T90m_TR2']
x7 = df2_TPM_values['5GB1_FM40_T150m_TR1_remake']
x8 = df2_TPM_values['5GB1_FM40_T180m_TR1']
kwargs = dict(histtype = "stepfilled", alpha = 0.1, normed = True, bins = 1000)
axes = plt.gca()
axes.set_xlim([0,5000])
axes.set_ylim([0,0.001])
plt.hist(x1, **kwargs)
plt.hist(x2, **kwargs)
plt.hist(x3, **kwargs)
plt.hist(x4, **kwargs)
plt.hist(x5, **kwargs)
plt.hist(x6, **kwargs)
plt.hist(x7, **kwargs)
plt.hist(x8, **kwargs)
Out[12]:
In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
In [14]:
# Start with StandardScaler. We need to scale along rows (genes), but sklearn scales
# columns, so transpose first and transpose back afterwards.
df2_TPM_values_T = df2_TPM_values.T  # samples x genes
standard_scaler = StandardScaler()
array_stand_scale = standard_scaler.fit_transform(df2_TPM_values_T)
df2a_stand_scale = pd.DataFrame(array_stand_scale).T
df2a_stand_scale.columns = df2_TPM_values.columns
df2a_stand_scale.index = df2_TPM_values.index
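Equivalently, the row-wise z-score can be computed directly in pandas without the transpose round-trip; a quick check, noting that StandardScaler uses the population standard deviation (ddof=0):
In [ ]:
# Row-wise z-score without transposing; ddof=0 matches StandardScaler.
row_mean = df2_TPM_values.mean(axis=1)
row_std = df2_TPM_values.std(axis=1, ddof=0)
df2a_alt = df2_TPM_values.sub(row_mean, axis=0).div(row_std, axis=0)
print((df2a_alt - df2a_stand_scale).abs().max().max())  # should be ~0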
In [15]:
df2a_stand_scale.describe()
Out[15]:
In [16]:
"""
columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
'5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']
"""
x1 = df2a_stand_scale["5GB1_FM40_T0m_TR2"]
x2 = df2a_stand_scale['5GB1_FM40_T10m_TR3']
x3 = df2a_stand_scale['5GB1_FM40_T20m_TR2']
x4 = df2a_stand_scale['5GB1_FM40_T40m_TR1']
x5 = df2a_stand_scale['5GB1_FM40_T60m_TR1']
x6 = df2a_stand_scale['5GB1_FM40_T90m_TR2']
x7 = df2a_stand_scale['5GB1_FM40_T150m_TR1_remake']
x8 = df2a_stand_scale['5GB1_FM40_T180m_TR1']
kwargs = dict(histtype = "stepfilled", alpha = 0.1, normed = True, bins = 1000)
"""
axes = plt.gca()
axes.set_xlim([0,5000])
axes.set_ylim([0,0.001])
"""
plt.hist(x1, **kwargs)
#plt.hist(x2, **kwargs)
#plt.hist(x3, **kwargs)
#plt.hist(x4, **kwargs)
#plt.hist(x5, **kwargs)
#plt.hist(x6, **kwargs)
#plt.hist(x7, **kwargs)
#plt.hist(x8, **kwargs)
Out[16]:
I'd like to plot each standard-scaled histogram on its own axes. The helper below, adapted from a Stack Exchange answer, draws a histogram for every column.
In [17]:
def draw_histograms(df, variables, n_rows, n_cols):
    fig = plt.figure(figsize=(20, 10))
    for i, var_name in enumerate(variables):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        df[var_name].hist(ax=ax, histtype="stepfilled", alpha=0.3, density=True, bins=1000)
        ax.set_title(var_name)
    plt.show()
testa = df2a_stand_scale
draw_histograms(testa, testa.columns, 2, 4)
In [18]:
# Min-max scale along rows (genes), again via the transpose.
df2_TPM_values_T = df2_TPM_values.T
min_max_scaler = MinMaxScaler()
df2b_min_max_rows = min_max_scaler.fit_transform(df2_TPM_values_T)
df2b_min_max_rows = pd.DataFrame(df2b_min_max_rows.T)
df2b_min_max_rows.columns = df2_TPM_values.columns
df2b_min_max_rows.index = df2_TPM_values.index
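The same row-wise min-max scaling can also be written directly in pandas, which makes the transpose step easy to sanity-check:
In [ ]:
# Row-wise min-max scaling without the transpose, for comparison.
row_min = df2_TPM_values.min(axis=1)
row_range = df2_TPM_values.max(axis=1) - row_min
df2b_alt = df2_TPM_values.sub(row_min, axis=0).div(row_range, axis=0)
print((df2b_alt - df2b_min_max_rows).abs().max().max())  # should be ~0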
In [19]:
testb = df2b_min_max_rows
draw_histograms(testb, testb.columns, 2, 4)
In [20]:
# Min-max scale along columns (samples) for comparison.
min_max_scaler = MinMaxScaler()
df2c_min_max_columns = pd.DataFrame(min_max_scaler.fit_transform(df2_TPM_values))
df2c_min_max_columns.columns = df2_TPM_values.columns
df2c_min_max_columns.index = df2_TPM_values.index
In [21]:
def draw_histograms(df, variables, n_rows, n_cols):
    # Redefined with fewer bins and a fixed y-limit for the min-max-scaled data.
    fig = plt.figure(figsize=(20, 10))
    for i, var_name in enumerate(variables):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        df[var_name].hist(ax=ax, histtype="stepfilled", alpha=0.3, density=True, bins=100)
        ax.set_title(var_name)
        ax.set_ylim([0, 5])
    plt.show()

testc = df2c_min_max_columns
draw_histograms(testc, testc.columns, 2, 4)
In [22]:
from sklearn.decomposition import PCA
In [23]:
pca = PCA(n_components=2)
pca.fit(df2c_min_max_columns)
print(pca.explained_variance_ratio_)  # fraction of the variance captured by each component
pcat = pca.transform(df2c_min_max_columns)
plt.figure()
plt.scatter(pcat[:, 0], pcat[:, 1])
Out[23]:
In [24]:
# Same PCA, zoomed in on the dense region.
pca = PCA(n_components=2)
pca.fit(df2c_min_max_columns)
print(pca.explained_variance_ratio_)  # fraction of the variance captured by each component
pcat = pca.transform(df2c_min_max_columns)
plt.figure()
plt.xlim(-.1, .4)
plt.ylim(-.2, .2)
plt.scatter(pcat[:, 0], pcat[:, 1])
Out[24]:
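Since only two components are kept, it is worth checking how much variance they actually capture; a quick look at the cumulative ratio over all components:
In [ ]:
# Cumulative explained variance across all principal components.
pca_full = PCA().fit(df2c_min_max_columns)
print(pca_full.explained_variance_ratio_.cumsum())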
In [25]:
pca = PCA(n_components=2)
pca.fit(df2a_stand_scale)
print(pca.explained_variance_ratio_)  # fraction of the variance captured by each component
pcat = pca.transform(df2a_stand_scale)
plt.scatter(pcat[:, 0], pcat[:, 1])
Out[25]:
In [26]:
# DBSCAN parameter sweep on the min-max-scaled (per sample) TPM values.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler

df = df2_TPM_values
X = MinMaxScaler().fit_transform(df)
for eps_value in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=eps_value, min_samples=ms).fit(X)
        print(set(db.labels_))
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)  # -1 is DBSCAN's noise label
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps_value, ms, n_clusters_))
What do the 14 clusters look like?
In [27]:
import numpy as np

X = MinMaxScaler().fit_transform(df2_TPM_values)
db_TPM_values = DBSCAN(eps=0.0001, min_samples=3).fit(X)
labels_TPM_values = db_TPM_values.labels_
print(np.unique(labels_TPM_values))
print(np.bincount(labels_TPM_values[labels_TPM_values != -1]))  # cluster sizes, noise excluded
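One way to see what the clusters look like is to overlay the TPM time courses of each cluster's members; a minimal sketch using the labels from above:
In [ ]:
# Overlay the TPM time courses of the genes in each DBSCAN cluster.
import numpy as np
import matplotlib.pyplot as plt

for k in np.unique(labels_TPM_values):
    if k == -1:
        continue  # skip DBSCAN's noise label
    members = df2_TPM_values[labels_TPM_values == k]
    plt.figure()
    plt.plot(members.T.values, alpha=0.3)  # one line per gene across the 8 timepoints
    plt.title("cluster %d (n=%d)" % (k, members.shape[0]))
plt.show()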
In [156]:
labels_TPM_values
Out[156]:
In [140]:
# Same DBSCAN sweep, this time on the row-wise standard-scaled frame (no extra scaling).
df = df2a_stand_scale
for eps_value in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=eps_value, min_samples=ms).fit(df)
        print(set(db.labels_))
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps_value, ms, n_clusters_))
In [141]:
# Same sweep on the column-wise (per sample) standard-scaled TPM values.
X = StandardScaler().fit_transform(df2_TPM_values)
for eps_value in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=eps_value, min_samples=ms).fit(X)
        print(set(db.labels_))
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps_value, ms, n_clusters_))
In [29]:
# Log2 fold change of each timepoint relative to T0.
tp_zero = df2_TPM_log2["5GB1_FM40_T0m_TR2"]
df2_TPM_log2_diff = df2_TPM_log2.subtract(tp_zero, axis="index")
df2_TPM_log2_diff = df2_TPM_log2_diff.loc[:, '5GB1_FM40_T10m_TR3':'5GB1_FM40_T180m_TR1']
df2_TPM_log2_diff.describe()
Out[29]:
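Because the table is already log2-transformed, this subtraction is exactly the log2 fold change relative to T0. A quick sanity check against the raw TPM ratios, assuming log_2_transform simply takes log2 of the TPM columns and remove_zero=True left no zeros:
In [ ]:
# log2(a) - log2(b) == log2(a / b): the diff table is log2 fold change vs T0.
check = np.log2(df2_TPM_values["5GB1_FM40_T10m_TR3"] / df2_TPM_values["5GB1_FM40_T0m_TR2"])
print(np.allclose(df2_TPM_log2_diff["5GB1_FM40_T10m_TR3"], check))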
In [30]:
def draw_histograms(df, variables, n_rows, n_cols):
    fig = plt.figure(figsize=(20, 10))
    for i, var_name in enumerate(variables):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        df[var_name].hist(ax=ax, histtype="stepfilled", alpha=0.3, density=True, bins=1000)
        ax.set_title(var_name)
    plt.show()
testa = df2_TPM_log2_diff
draw_histograms(testa, testa.columns, 2, 4)
PCA of this distribution
In [31]:
pca = PCA(n_components=2)
pca.fit(df2_TPM_log2_diff)
print(pca.explained_variance_ratio_)  # fraction of the variance captured by each component
pcat = pca.transform(df2_TPM_log2_diff)
plt.scatter(pcat[:, 0], pcat[:, 1])
Out[31]:
In [32]:
# DBSCAN sweep on the log2 fold-change table, with no additional scaling.
X = df2_TPM_log2_diff
for eps_value in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=eps_value, min_samples=ms).fit(X)
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print(np.unique(labels))
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps_value, ms, n_clusters_))
In [33]:
dbscan_log2diff = DBSCAN(eps=0.1, min_samples=3).fit(df2_TPM_log2_diff)
labels_log2diff = dbscan_log2diff.labels_
print(np.unique(labels_log2diff))
print(np.bincount(labels_log2diff[labels_log2diff != -1]))  # cluster sizes, noise excluded
In [34]:
# Map each cluster label (0-37) to the positional indices of its member genes.
clusters_38 = {i: np.where(labels_log2diff == i)[0] for i in range(38)}
In [35]:
clusters_38
Out[35]:
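The arrays in clusters_38 hold positional indices, so recovering the gene identifiers for a cluster means indexing back into the frame:
In [ ]:
# Positional indices -> gene identifiers, shown here for cluster 0.
genes_c0 = df2_TPM_log2_diff.index[clusters_38[0]]
print(genes_c0[:10])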
In [144]:
pca = PCA(n_components=2)
pca.fit(df2_TPM_log2_diff)
print(pca.explained_variance_ratio_)  # fraction of the variance captured by each component
pcat = pd.DataFrame(pca.transform(df2_TPM_log2_diff))
pcat[2] = labels_log2diff  # attach the DBSCAN labels as a third column
In [151]:
pcat[2].max()
Out[151]:
In [154]:
def dfScatter(df, xcol=0, ycol=1, catcol=2):
    # Scatter the first two PCs, coloured by cluster label.
    fig, ax = plt.subplots()
    categories = np.unique(df[catcol])
    colors = np.linspace(0, 1, len(categories))
    colordict = dict(zip(categories, colors))
    df["Color"] = df[catcol].apply(lambda x: colordict[x])
    ax.scatter(df[xcol], df[ycol], c=df.Color)
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    return fig
In [155]:
dfScatter(pcat)
Out[155]:
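With this many categories a continuous colour ramp makes neighbouring clusters hard to distinguish; one option is a qualitative colormap, e.g.:
In [ ]:
# Same PCA scatter, coloured with a qualitative colormap (illustrative).
fig, ax = plt.subplots()
sc = ax.scatter(pcat[0], pcat[1], c=pcat[2], cmap="tab20", s=10)
plt.xlim(-1, 1)
plt.ylim(-1, 1)
plt.colorbar(sc, label="DBSCAN label")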
In [104]:
pcat2 = pcat.copy()  # copy so the label experiments below don't mutate pcat
In [105]:
pcat2.iloc[0:1000,2] = 0
#pcat2.iloc[1000:3000,2] = 1
#pcat2.iloc[3000:,2]= 2
In [ ]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

df = df2_TPM_values
print(df.head())

# Histogram of the raw TPM values, saved to disk.
df.plot.hist(bins=100)
plt.savefig("hist.pdf")

# Min-max scale per sample, then histogram the scaled values.
X = MinMaxScaler().fit_transform(df)
dfX = pd.DataFrame(X)
print(dfX.head())
dfX.plot.hist(bins=100)
plt.savefig("histX.pdf")

# Two-component PCA of the scaled values.
pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_)
pcat = pca.transform(X)
plt.figure()
plt.scatter(pcat[:, 0], pcat[:, 1])
plt.savefig("pca.pdf")
plt.show()

# DBSCAN sweep over eps and min_samples.
for eps_value in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=eps_value, min_samples=ms).fit(X)
        print(set(db.labels_))
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps_value, ms, n_clusters_))
In [ ]:
# DBSCAN sweep on array_stand_scale, the row-wise standard-scaled array from In [14].
# Note: array_stand_scale is the transposed (samples x genes) matrix, so this
# clusters the 8 timepoints rather than the genes.
for eps_value in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=eps_value, min_samples=ms).fit(array_stand_scale)
        print(set(db.labels_))
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps_value, ms, n_clusters_))
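Rather than sweeping eps blindly, a common heuristic is the k-distance elbow: sort each point's distance to its k-th nearest neighbour and look for the knee. A minimal sketch with scikit-learn's NearestNeighbors, here on the min-max-scaled TPM values:
In [ ]:
# k-distance plot, a standard heuristic for choosing DBSCAN's eps.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

X = MinMaxScaler().fit_transform(df2_TPM_values)
k = 3  # match min_samples
nn = NearestNeighbors(n_neighbors=k + 1).fit(X)  # +1 because each point is its own nearest neighbour
dist, _ = nn.kneighbors(X)
plt.plot(np.sort(dist[:, -1]))
plt.xlabel("points sorted by k-distance")
plt.ylabel("distance to %d-th nearest neighbour" % k)
plt.show()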