In [1]:
import numpy as np
import pandas as pd
from scipy.signal import argrelmax
import sys
sys.path.append('/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages')
import tensorflow as tf
import tflearn
import tsne
from sklearn.manifold import TSNE
import math
from scipy import cluster
import matplotlib.pyplot as plt
from sklearn.decomposition import ProjectedGradientNMF
%matplotlib inline
np.random.seed(0)
tf.set_random_seed(0)
from sklearn.metrics import classification_report
In [2]:
#Read the CSV file and put it into pandas data frame
df = pd.read_csv('Polymers Plus Analytes_SNV_Reduced_SGSmoothed.csv', index_col=0)
In [3]:
#Take a quick look at the data
df
Out[3]:
In [5]:
# How many points on each side to use for the comparison to consider comparator(n, n+x) to be True.
neighborhood = 5
for neighborhood in range(1, 20):
    local_maxima = []
    for i in range(df.shape[0]):
        row = np.array(df.iloc[[i]])
        a = argrelmax(row[0], order=neighborhood)
        local_maxima.append(len(a[0]))
    print("The average number of local maxima with neighborhood size %d is %.3f" % (neighborhood, np.mean(local_maxima)))
In [5]:
def xavier_init(fan_in, fan_out, constant=1):
    """Xavier initialization of network weights."""
    # https://stackoverflow.com/questions/33640581/how-to-do-xavier-initialization-on-tensorflow
    low = -constant * np.sqrt(6.0 / (fan_in + fan_out))
    high = constant * np.sqrt(6.0 / (fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out),
                             minval=low, maxval=high,
                             dtype=tf.float32)
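xavier_init is not called elsewhere in this notebook; a minimal sketch of how it could seed a weight matrix (the 1024-by-256 shape here is hypothetical):
# Hypothetical shapes, only to illustrate the call
W = tf.Variable(xavier_init(1024, 256), name="xavier_weights")
b = tf.Variable(tf.zeros([256]), name="bias")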
In [6]:
data_set = []
for i in range(df.shape[0]):
    row = np.array(df.iloc[[i]])
    data_set.append(row[0])
In [7]:
#Number of neurons in each layer
layer1_neurons = int(math.pow(2,10))
layer2_neurons = int(math.pow(2,8))
layer3_neurons = int(math.pow(2,4))
layer4_neurons = int(math.pow(2,1))
print("Layer1: %d, Layer2: %d, Layer3: %d, Layer4: %d"%(layer1_neurons,layer2_neurons,layer3_neurons,layer4_neurons))
In [ ]:
#Initializing the DNN
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.8)
# Building the encoder
encoder = tflearn.input_data(shape=[None, len(data_set[0])])
INPUT = encoder
encoder = tflearn.fully_connected(encoder, layer1_neurons, name = "en_1")
encoder = tflearn.fully_connected(encoder, layer2_neurons, name = "en_2")
encoder = tflearn.fully_connected(encoder, layer3_neurons, name = "en_3")
encoder = tflearn.fully_connected(encoder, layer4_neurons, name = "en_4")
HIDDEN_STATE = encoder
# Building the decoder
decoder = tflearn.fully_connected(encoder, layer3_neurons, name = "de_1")
decoder = tflearn.fully_connected(decoder, layer2_neurons, name = "de_2")
decoder = tflearn.fully_connected(decoder, layer1_neurons, name = "de_3")
decoder = tflearn.fully_connected(decoder, len(data_set[0]), name = "de_4")
OUTPUT = decoder
# Regression, with mean square error
net = tflearn.regression(decoder, optimizer='AdaDelta', learning_rate=0.001,loss='mean_square', metric=None)
# Training the auto encoder
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(data_set, data_set, n_epoch=10,validation_set=0.1, run_id="auto_encoder", batch_size=10)
#Encode an input X and return the middle layer of the AE
def encode(X):
    if len(X.shape) < 2:
        X = X.reshape(1, -1)
    tflearn.is_training(False, model.session)
    res = model.session.run(HIDDEN_STATE, feed_dict={INPUT.name: X})
    return res
encode_decode = model.predict(data_set)
result_autoencoder = []
for i in range(len(encode_decode)):
    result_autoencoder.append(encode(data_set[i]))
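As a quick sanity check on the trained autoencoder (a sketch, assuming model.predict returns rows aligned with data_set), the overall reconstruction error can be computed:
# Added sketch: mean squared reconstruction error over the whole data set
recon_mse = np.mean((np.array(encode_decode) - np.array(data_set)) ** 2)
print("Autoencoder reconstruction MSE: %.6f" % recon_mse)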
In [ ]:
# network parameters
input_dim = len(data_set[0]) # height data input
encoder_hidden_dim = 128
decoder_hidden_dim = 128
latent_dim = 2
batch_size = 10
n_epoch = 10
# paths
TENSORBOARD_DIR='experiment/'
CHECKPOINT_PATH='out_models/'
# encoder
def encode(input_x):
    encoder = tflearn.fully_connected(input_x, encoder_hidden_dim, activation='relu')
    mu_encoder = tflearn.fully_connected(encoder, latent_dim, activation='linear')
    logvar_encoder = tflearn.fully_connected(encoder, latent_dim, activation='linear')
    return mu_encoder, logvar_encoder
# decoder
def decode(z):
    decoder = tflearn.fully_connected(z, decoder_hidden_dim, activation='relu', restore=False)
    x_hat = tflearn.fully_connected(decoder, input_dim, activation='linear', restore=False)
    return x_hat
# sampler (reparameterization trick)
def sample(mu, logvar):
    epsilon = tf.random_normal(tf.shape(logvar), dtype=tf.float32, name='epsilon')
    std_encoder = tf.exp(tf.multiply(0.5, logvar))
    z = tf.add(mu, tf.multiply(std_encoder, epsilon))
    return z
# loss function (regularization: KL divergence from the unit Gaussian prior)
def calculate_regularization_loss(mu, logvar):
    kl_divergence = -0.5 * tf.reduce_sum(1 + logvar - tf.square(mu) - tf.exp(logvar), reduction_indices=1)
    return kl_divergence
# loss function (reconstruction)
def calculate_reconstruction_loss(x_hat, input_x):
    mse = tflearn.objectives.mean_square(x_hat, input_x)
    return mse
# trainer
def define_trainer(target, optimizer):
    trainop = tflearn.TrainOp(loss=target,
                              optimizer=optimizer,
                              batch_size=batch_size,
                              metric=None,
                              name='vae_trainer')
    trainer = tflearn.Trainer(train_ops=trainop,
                              tensorboard_dir=TENSORBOARD_DIR,
                              tensorboard_verbose=3,
                              checkpoint_path=CHECKPOINT_PATH,
                              max_checkpoints=1)
    return trainer
def define_evaluator(trainer, mu, logvar):
    evaluator = tflearn.Evaluator([mu, logvar], session=trainer.session)
    return evaluator
input_x = tflearn.input_data(shape=(None, len(data_set[0])), name='input_x')
mu, logvar = encode(input_x)
z = sample(mu, logvar)
x_hat = decode(z)
regularization_loss = calculate_regularization_loss(mu, logvar)
reconstruction_loss = calculate_reconstruction_loss(x_hat, input_x)
target = tf.reduce_mean(tf.add(regularization_loss, reconstruction_loss))
optimizer = tflearn.optimizers.AdaDelta()
optimizer = optimizer.get_tensor()
trainer = define_trainer(target, optimizer)
trainer.fit(feed_dicts={input_x: data_set}, val_feed_dicts={input_x: data_set},
            n_epoch=n_epoch,
            show_metric=False,
            snapshot_epoch=False,
            shuffle_all=True,
            run_id='VAE')
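define_evaluator is never invoked above; a sketch (assuming training has finished and that tflearn's Evaluator.predict accepts a feed dict) of pulling out the 2-D latent means for later inspection:
# Added sketch, not part of the original cell
evaluator = define_evaluator(trainer, mu, logvar)
latent_mu, latent_logvar = evaluator.predict({input_x: data_set})
print(np.array(latent_mu).shape)  # expected: (n_samples, latent_dim)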
In [8]:
model = TSNE(n_components=2, random_state=0)
In [9]:
output = model.fit_transform(data_set)
In [10]:
col = df[df.columns[0]]
In [11]:
col = df.iloc[:,0]
In [7]:
df.head()
Out[7]:
In [13]:
tsne_result = TSNE(n_components=2, perplexity=100, verbose=2).fit_transform(data_set)
In [8]:
#Get the labels for names
df_1 = pd.read_csv('Polymers Plus Analytes_SNV_Reduced_SGSmoothed.csv')
names = df_1.iloc[:,0]
labels = []
for name in names:
    labels.append(name)
In [9]:
colors = {}
colors['A1'] = 'red'
colors['A2'] = 'blue'
colors['A3'] = 'green'
colors['A4'] = 'orange'
colors['C1'] = 'lawngreen'
colors['C2'] = 'khaki'
colors['C3'] = 'darkviolet'
colors['C4'] = 'indigo'
colors['PC'] = 'cyan'
colors['PE'] = 'teal'
colors['PI'] = 'salmon'
colors['PM'] = 'darkcyan'
colors['PV'] = 'seagreen'
label_color = []
for i in range(df.shape[0]):
    label_color.append(colors[labels[i][:2]])
In [16]:
# plot the result
vis_x = tsne_result[:, 0]
vis_y = tsne_result[:, 1]
for i in range(df.shape[0]):
    plt.scatter(vis_x[i], vis_y[i], c = label_color[i])
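The per-point scatter calls above produce no legend; a small sketch (using matplotlib.patches, not in the original cell) that maps each two-letter prefix to its color, intended to be appended to the plotting loop before the figure renders:
# Added sketch: proxy patches give a legend for the prefix-to-color mapping
import matplotlib.patches as mpatches
handles = [mpatches.Patch(color=c, label=name) for name, c in colors.items()]
plt.legend(handles=handles, loc='best', fontsize='small')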
In [10]:
min_overall = 0
for i in range(df.shape[0]):
    min_overall = min(min(data_set[i]), min_overall)
min_overall *= -1
data_set_pos = []
for i in range(df.shape[0]):
    data_set_pos.append(data_set[i] + min_overall)
In [70]:
num_component = 7
nmf_model = ProjectedGradientNMF(n_components=num_component, init='random', random_state=0)
# rows 21..41 are the second group, matching the labels[21:42] display below
W = nmf_model.fit_transform(data_set_pos[21:42])
H = nmf_model.components_
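A quick check (a sketch, not in the original notebook) of how closely the 7-component factorization reproduces the shifted spectra of this group:
# Added sketch: Frobenius norm of the NMF reconstruction residual
X_group = np.array(data_set_pos[21:42])
X_approx = np.dot(W, H)
print("Frobenius reconstruction error: %.4f" % np.linalg.norm(X_group - X_approx))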
In [74]:
clusters = [[] for _ in range(num_component)]
for i in range(len(W)):
    cluster_ = np.argmax(W[i])
    clusters[cluster_].append(labels[i + 21])
In [75]:
clusters
Out[75]:
In [11]:
labels[21:42]
Out[11]:
In [12]:
colors_analyte = {}
colors_analyte['C'] = 'red'
colors_analyte['G'] = 'blue'
colors_analyte['HC'] = 'green'
colors_analyte['Il'] = 'orange'
colors_analyte['Ph'] = 'lawngreen'
colors_analyte['PP'] = 'khaki'
colors_analyte['SA'] = 'darkviolet'
label_color_analyte = []
for i in range(21):
    if labels[i][3:4] == 'C' or labels[i][3:4] == 'G':
        label_color_analyte.append(colors_analyte[labels[i][3:4]])
    else:
        label_color_analyte.append(colors_analyte[labels[i][3:5]])
In [46]:
# scikit-learn's default barnes_hut solver only supports n_components <= 3, so the exact method is used here
tsne_result = TSNE(n_components=7, perplexity=50, verbose=2, method='exact').fit_transform(data_set[:21])
# plot the result
vis_x = tsne_result[:21, 0]
vis_y = tsne_result[:21, 1]
for i in range(21):
    plt.scatter(vis_x[i], vis_y[i], c = label_color_analyte[i])
In [47]:
initial = [cluster.vq.kmeans(data_set[0:21],i) for i in range(1,10)]
plt.plot([var for (cent,var) in initial])
plt.show()
In [51]:
cent, var = initial[2]
#use vq() to get as assignment for each obs.
assignment,cdist = cluster.vq.vq(data_set[:21],cent)
plt.scatter(vis_x, vis_y, c=assignment)
plt.show()
In [13]:
def bucketing(numbers, num_bucket):
    result = [0] * num_bucket
    length = len(numbers) // num_bucket  # integer bucket width (works in Python 2 and 3)
    for i in range(num_bucket):
        begin = i * length
        end = (i + 1) * length
        result[i] = max(numbers[begin:end])
    return result
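A toy check of the helper (a sketch, not in the original notebook): eight values max-pooled into four buckets of two.
print(bucketing([1, 5, 2, 2, 9, 0, 3, 4], 4))  # -> [5, 2, 9, 4]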
In [14]:
#Bucketing the data set into 20 buckets
data_set_new = []
for i in range(df.shape[0]):
    data_set_new.append(bucketing(data_set[i], 20))
In [15]:
group = 1
begin_group = group*21
end_group = (group + 1) * 21
print(begin_group, end_group)
In [16]:
initial = [cluster.vq.kmeans(data_set_new[begin_group:end_group],i) for i in range(1,10)]
plt.plot([var for (cent,var) in initial])
plt.title("Group: %d"%(group+1))
plt.xlabel('Number of clusters')
plt.ylabel('Average within cluster sum of squares')
plt.show()
In [17]:
num_clusters = 3
cent, var = initial[num_clusters-1]
#use vq() to get as assignment for each obs.
assignment,cdist = cluster.vq.vq(data_set_new[begin_group:end_group],cent)
#plt.scatter(vis_x, vis_y, c=assignment)
#plt.show()
In [18]:
clusters_kmeans = [[] for _ in range(num_clusters)]
for i in range(len(assignment)):
    clusters_kmeans[assignment[i]].append(labels[i + (group*21)])
In [19]:
clusters_kmeans
Out[19]:
In [20]:
num_clusters_list = [3,3,2,3,5,3,4,4,2,2,3,2,3]
In [21]:
kmeans_file = open('k-means_fixed_3.txt', 'w')
for group in range(13):
    begin_group = group*21
    end_group = (group + 1) * 21
    if group > 3:
        end_group -= 3
    if group > 4:
        begin_group -= 3
    print(begin_group, end_group, labels[begin_group:end_group])
    initial = [cluster.vq.kmeans(data_set_new[begin_group:end_group], i) for i in range(1, 10)]
    plt.plot([var for (cent, var) in initial])
    plt.title("Group: %d" % (group + 1))
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within cluster sum of squares')
    plt.show()
    #num_clusters = num_clusters_list[group]
    num_clusters = 3
    cent, var = initial[num_clusters - 1]
    assignment, cdist = cluster.vq.vq(data_set_new[begin_group:end_group], cent)
    clusters_kmeans = [[] for _ in range(num_clusters)]
    for i in range(len(assignment)):
        clusters_kmeans[assignment[i]].append(labels[i + begin_group])
    kmeans_file.writelines('Group: %d\n' % group)
    for i in range(num_clusters):
        kmeans_file.write("Cluster %d: %s " % (i + 1, " , ".join(clusters_kmeans[i])))
    kmeans_file.write("\n")
kmeans_file.close()
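Instead of fixing k=3 for every group, the per-group cluster count could be chosen automatically; a sketch using scikit-learn's KMeans and silhouette_score, which are not part of the original pipeline:
# Added sketch: pick k by silhouette score (not used in the notebook above)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_k(X, k_max=6):
    best_k, best_score = 2, -1.0
    for k in range(2, min(k_max, len(X) - 1) + 1):
        model_k = KMeans(n_clusters=k, random_state=0).fit(X)
        score = silhouette_score(X, model_k.labels_)
        if score > best_score:
            best_k, best_score = k, score
    return best_k

print(pick_k(np.array(data_set_new[0:21])))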
In [22]:
for group in range(13):
    begin_group = group*21
    end_group = (group + 1) * 21
    if group > 3:
        end_group -= 3
    if group > 4:
        begin_group -= 3
    print(group + 1, begin_group, end_group, labels[begin_group:end_group])