In [1]:
cd executible/
In [2]:
%run Cu_transition_functionalized.py
In [3]:
import hdbscan
import time
from sklearn import metrics
In [4]:
# Load raw counts and build the derived tables used throughout this notebook:
# TPM-normalized counts, log2 and mean-centered variants, and distance matrices.
df1_raw_FM40 = raw_data_cleanup("5G_counts.tsv")
columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
'5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']
df2_TPM = TPM_counts(df1_raw_FM40, "start_coord", "end_coord",columns, remove_zero = True) #TPM counts
df2_TPM_log2 = log_2_transform(df2_TPM, "5GB1_FM40_T0m_TR2","5GB1_FM40_T180m_TR1") #TPM log 2 transformed
df2_TPM_mean = mean_center(df2_TPM, "5GB1_FM40_T0m_TR2","5GB1_FM40_T180m_TR1") #TPM mean centered
df3_pearson_r = congruency_table(df2_TPM, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1", step = df2_TPM.shape[0])
df3_euclidean_mean = euclidean_distance(df2_TPM_mean, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1")
# BUG FIX: this previously passed df2_TPM_mean (copy-paste of the line above), so the
# "log2" distance matrix was identical to the mean-centered one. Use the log2 table.
df3_euclidean_log2 = euclidean_distance(df2_TPM_log2, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1" )
print("The shape of the TPM table is ", df2_TPM.shape)
print("The shape of the pearson_r matrix is ", df3_pearson_r.shape)
In [5]:
# HDBSCAN over the precomputed Pearson-r distance matrix (rows with N/A values removed),
# timing the fit so expensive cells are visible to the reader.
hdb_start = time.time()
pearson_clusterer = hdbscan.HDBSCAN(metric = "precomputed", min_cluster_size=10)
hdb_pearson_r = pearson_clusterer.fit(df3_pearson_r)
hdb_pearson_r_labels = hdb_pearson_r.labels_
hdb_elapsed_time = time.time() - hdb_start
print("time to cluster", hdb_elapsed_time)
In [6]:
print(np.unique(hdb_pearson_r_labels)) # unique cluster labels; -1 marks noise points (not a cluster)
print(np.bincount(hdb_pearson_r_labels[hdb_pearson_r_labels!=-1])) # members per cluster, noise (-1) excluded
In [24]:
pearson_clusters = {i: np.where(hdb_pearson_r_labels == i)[0] for i in range(2)}
In [23]:
pearson_clusters
Out[23]:
In [25]:
#pd.set_option('display.height', 500) #These two commands allow for the display of max of 500 rows - exploring genes
#pd.set_option('display.max_rows', 500)
# Show the TPM rows that fell into pearson cluster 1 (cluster ids here are 0 and 1)
df2_TPM.iloc[pearson_clusters[1],:] #the genes that were clustered together [0,1]
Out[25]:
Looks like there are two clusters: one with some expression and one with zero expression across samples.
In [27]:
df3_euclidean_mean.hist()
Out[27]:
In [10]:
# HDBSCAN over the precomputed euclidean distance matrix of the
# mean-centered TPM counts, with timing.
hdb_start = time.time()
euclid_mean_clusterer = hdbscan.HDBSCAN(metric = "precomputed", min_cluster_size=10)
hdb_euclidean_mean = euclid_mean_clusterer.fit(df3_euclidean_mean)
hdb_euclidean_mean_labels = hdb_euclidean_mean.labels_
hdb_elapsed_time = time.time() - hdb_start
print("time to cluster", hdb_elapsed_time)
In [11]:
print(np.unique(hdb_euclidean_mean_labels)) # unique cluster labels; -1 marks noise points
print(np.bincount(hdb_euclidean_mean_labels[hdb_euclidean_mean_labels!=-1])) # members per cluster, noise excluded
In [12]:
# Map cluster label -> row indices; generalized to however many clusters were
# found (labels are 0..k-1, -1 is noise) instead of a hard-coded 2.
euclidean_mean_clusters = {i: np.where(hdb_euclidean_mean_labels == i)[0]
                           for i in range(hdb_euclidean_mean_labels.max() + 1)}
df2_TPM.iloc[euclidean_mean_clusters[1],:] # TPM rows belonging to cluster 1
Out[12]:
Looks like 2 clusters - both with zero expression.
Looks like whether the input is a numpy array or a pandas DataFrame, the result is the same. Let's now try to get the indices of the clustered points.
In [28]:
df3_euclidean_log2
Out[28]:
In [13]:
# Clustering the log2 transformed euclidean distance of TPM counts
hdb_t1 = time.time()
hdb_euclidean_log2 = hdbscan.HDBSCAN(metric = "precomputed", min_cluster_size=10).fit(df3_euclidean_log2)
hdb_euclidean_log2_labels = hdb_euclidean_log2.labels_
hdb_elapsed_time = time.time() - hdb_t1
print("time to cluster", hdb_elapsed_time)
In [14]:
print(np.unique(hdb_euclidean_log2_labels)) # unique cluster labels; -1 marks noise points
print(np.bincount(hdb_euclidean_log2_labels[hdb_euclidean_log2_labels!=-1])) # members per cluster, noise excluded
In [15]:
# Map cluster label -> row indices; generalized to the number of clusters
# actually found (labels are 0..k-1, -1 is noise) instead of a hard-coded 2.
euclidean_log2_clusters = {i: np.where(hdb_euclidean_log2_labels == i)[0]
                           for i in range(hdb_euclidean_log2_labels.max() + 1)}
df2_TPM.iloc[euclidean_log2_clusters[1],:] # TPM rows belonging to cluster 1
Out[15]:
In [16]:
df2_TPM_values = df2_TPM.loc[:,"5GB1_FM40_T0m_TR2":"5GB1_FM40_T180m_TR1"] #isolating the data value columns
df2_TPM_values_T = df2_TPM_values.T #transposing so each gene becomes a column (scaler works per column)
standard_scaler = StandardScaler()
# NOTE: StandardScaler mean-centers AND scales each gene to unit variance —
# despite the variable name, this is standardization, not just mean centering.
TPM_counts_mean_centered = standard_scaler.fit_transform(df2_TPM_values_T)
TPM_counts_mean_centered = pd.DataFrame(TPM_counts_mean_centered) #back to DataFrame
#transposing back to the original orientation and reinserting index and columns
my_index = df2_TPM_values.index
my_columns = df2_TPM_values.columns
TPM_counts_mean_centered = TPM_counts_mean_centered.T
TPM_counts_mean_centered.set_index(my_index, inplace=True)
TPM_counts_mean_centered.columns = my_columns
In [17]:
# HDBSCAN directly on the standardized TPM counts using the euclidean metric
# (previous header comment wrongly said "pearsons_R" — this cell is euclidean).
hdb_start = time.time()
standardized_clusterer = hdbscan.HDBSCAN(metric = "euclidean", min_cluster_size=5)
hdb_euclidean = standardized_clusterer.fit(TPM_counts_mean_centered)
hdb_euclidean_labels = hdb_euclidean.labels_
hdb_elapsed_time = time.time() - hdb_start
print("time to cluster", hdb_elapsed_time)
In [18]:
print(np.unique(hdb_euclidean_labels)) # unique cluster labels; -1 marks noise points
print(np.bincount(hdb_euclidean_labels[hdb_euclidean_labels!=-1])) # members per cluster, noise excluded
In [30]:
# Map cluster label -> row indices; generalized to the number of clusters found
# (labels are 0..k-1, -1 is noise) instead of the hard-coded 7.
Euclidean_standard_scaled_clusters = {i: np.where(hdb_euclidean_labels == i)[0]
                                      for i in range(hdb_euclidean_labels.max() + 1)}
df2_TPM.iloc[Euclidean_standard_scaled_clusters[1],:] # TPM rows belonging to cluster 1
Out[30]:
In [19]:
# Standardize the log2-transformed TPM counts per gene.
df2_TPM_log2_scale= df2_TPM_log2.T #transposing so each gene becomes a column (scaler works per column)
standard_scaler = StandardScaler()
# NOTE: StandardScaler mean-centers AND scales to unit variance (standardization).
TPM_log2_mean_scaled = standard_scaler.fit_transform(df2_TPM_log2_scale)
TPM_log2_mean_scaled = pd.DataFrame(TPM_log2_mean_scaled) #back to DataFrame
# Transpose back to the original orientation and reinsert index and columns.
# FIX: take index/columns from df2_TPM_log2 itself rather than df2_TPM_values,
# so the labels stay correct even if the two tables ever diverge in row order.
my_index = df2_TPM_log2.index
my_columns = df2_TPM_log2.columns
TPM_log2_mean_scaled = TPM_log2_mean_scaled.T
TPM_log2_mean_scaled.set_index(my_index, inplace=True)
TPM_log2_mean_scaled.columns = my_columns
In [20]:
# HDBSCAN on the standardized log2 TPM counts using the euclidean metric
# (previous header comment wrongly said "pearsons_R").
hdb_t1 = time.time()
hdb_log2_clusterer = hdbscan.HDBSCAN(metric = "euclidean", min_cluster_size=5).fit(TPM_log2_mean_scaled)
# FIX: previously the fitted clusterer was immediately overwritten by its own
# labels_ array (name reuse), losing the model object. Keep the clusterer and
# assign the labels to the same name downstream cells already read.
hdb_log2_euclidean = hdb_log2_clusterer.labels_
hdb_elapsed_time = time.time() - hdb_t1
print("time to cluster", hdb_elapsed_time)
In [21]:
print(np.unique(hdb_log2_euclidean)) # unique cluster labels; -1 marks noise points
print(np.bincount(hdb_log2_euclidean[hdb_log2_euclidean!=-1])) # members per cluster, noise excluded
In [ ]: