In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import seaborn as sns
from pandas.plotting import andrews_curves, parallel_coordinates
from sklearn.cluster import KMeans
from sklearn import preprocessing
import time
In [2]:
#%config InlineBackend.figure_format = 'svg'
%matplotlib inline
In [3]:
# Seed NumPy's global random generator so that later steps relying on it
# are repeatable when the notebook is run top to bottom on a fresh kernel.
SEED = 5
np.random.seed(SEED)
In [4]:
# Display the local wall-clock time at which the notebook was executed.
time.ctime()
Out[4]:
In [5]:
# Locate the three input CSV files (1.csv, 2.csv, 3.csv) in the current
# user's Downloads folder; they are read as the early / moderate / severe
# tables in the loading cell.
home_dir = os.path.expanduser('~')
downloads = os.path.join(home_dir, 'Downloads')
early, moderate, severe = [
    os.path.join(downloads, '{}.csv'.format(stage_no))
    for stage_no in (1, 2, 3)
]
In [6]:
# Standard (zero-mean / unit-variance) scaler with its default settings
# spelled out.
# NOTE(review): `scaler` is not referenced by any cell visible in this
# notebook -- confirm whether it is still needed.
scaler = preprocessing.StandardScaler(
    copy=True,
    with_mean=True,
    with_std=True,
)
In [7]:
# Load the three per-stage tables and rename each file's `avg` column so the
# stages can sit side by side in one frame. `index=str` turns the default
# integer row labels into strings.
early_df = pd.read_csv(early).rename(index=str, columns={"avg": "EarlyAvg"})
moderate_df = pd.read_csv(moderate).rename(index=str, columns={"avg": "ModerateAvg"})
severe_df = pd.read_csv(severe).rename(index=str, columns={"avg": "SevereAvg"})

# The three score columns used as the "time series" throughout the notebook.
cols = ['EarlyAvg', 'ModerateAvg', 'SevereAvg']

# Column-wise concat aligns rows on their index labels.
# NOTE(review): this assumes the three CSVs list the same rows in the same
# order -- confirm against the files.
merged_df = pd.concat(
    [
        early_df[['Namespace', 'Name', 'EarlyAvg']],
        moderate_df['ModerateAvg'],
        severe_df['SevereAvg'],
    ],
    axis=1,
)

# Keep only rows whose early score is present and non-zero. The explicit
# .copy() makes `df` an independent frame, so that adding the `label` column
# in a later cell does not raise SettingWithCopyWarning.
has_early_score = merged_df['EarlyAvg'].notnull() & (merged_df['EarlyAvg'] != 0)
df = merged_df[has_early_score].copy()

# Persist the merged table to the user's Desktop for reference.
df.to_csv(os.path.join(os.path.expanduser('~'), 'Desktop', 'time_series_cmpa.csv'))
df.head()
Out[7]:
In [8]:
# Pairwise scatter plots (with per-column distributions on the diagonal)
# of the three stage scores.
stage_scores = df[cols]
sns.pairplot(stage_scores)
plt.show()
Use the Pearson correlation between rows, treating each row's three stage averages as a short time series.
In [9]:
# Transpose so that every original row becomes a column, then correlate the
# columns: the result is a row-by-row Pearson correlation matrix.
scores_by_row = df[cols].transpose()
corr_df = scores_by_row.corr(method='pearson')
Clustering reveals three general patterns of biological processes across the disease progression.
In [10]:
# Draw the correlation matrix as a clustered heatmap and save it as a PDF
# on the user's Desktop.
clustering_pdf = os.path.join(
    os.path.expanduser('~'), 'Desktop', 'time_series_clustering_ad.pdf'
)
cg = sns.clustermap(corr_df)
# Keep the heatmap's row tick labels horizontal so they stay readable.
row_tick_labels = cg.ax_heatmap.yaxis.get_majorticklabels()
plt.setp(row_tick_labels, rotation=0)
plt.savefig(clustering_pdf)
plt.show()
Assign classes based on a simple k-means clustering.
In [11]:
# Partition the rows into five k-means clusters based on their three stage
# scores and store the cluster id of every row in a `label` column.
# random_state is fixed (same value as the notebook's NumPy seed) so that the
# cluster numbering does not depend on how much of the global random stream
# earlier cells consumed; re-running this cell alone gives the same labels,
# which the per-class discussion further down relies on.
km = KMeans(n_clusters=5, random_state=5)
km.fit(df[cols])
df['label'] = km.labels_
Parallel coordinates immediately reveal groups of patterns in how each mechanism relates to the stages of disease progression. Andrews curves use Fourier analysis to reveal further patterns in the frequency domain.
In [12]:
# One line per row across the three stage scores, coloured by k-means label;
# the figure is saved as a PDF on the user's Desktop.
labelled_scores = df[cols + ['label']]
parallel_coordinates(labelled_scores, 'label')
pc_pdf = os.path.join(os.path.expanduser('~'), 'Desktop', 'time_series_pc.pdf')
plt.savefig(pc_pdf)
Retry the whole analysis, but min/max-normalize each column first.
In [13]:
# Min/max-normalize every score column independently:
#   (value - column minimum) / (column maximum - column minimum)
raw_scores = df[cols]
col_min = raw_scores.min()
col_span = raw_scores.max() - col_min
df_norm = (raw_scores - col_min) / col_span

# Row-by-row Pearson correlation of the normalized scores.
norm_corr_df = df_norm[cols].transpose().corr()
In [14]:
# Pairwise scatter plots of the three normalized stage scores.
normalized_scores = df_norm[cols]
sns.pairplot(normalized_scores)
plt.show()
In [16]:
#cg = sns.clustermap(norm_corr_df)
#plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
#plt.show()
In [17]:
# Cluster the min/max-normalized scores into six k-means groups and attach
# the cluster ids to the normalized frame.
# The model is fitted on df_norm (not on the raw df) so that the labels stored
# in df_norm describe the normalized data this section analyses. Selecting
# `cols` keeps the `label` column out of the features if the cell is re-run.
# random_state is fixed so the labels are the same on every run.
km_norm = KMeans(n_clusters=6, random_state=5)
km_norm.fit(df_norm[cols])
df_norm['label'] = km_norm.labels_
In [18]:
# Parallel-coordinates view of the normalized scores, coloured by the
# cluster label stored in df_norm.
norm_ax = plt.gca()
norm_ax.set_title('Parallel Coordinates on Normalized Data')
parallel_coordinates(df_norm, 'label', ax=norm_ax)
plt.show()
Class 0
contains candidate mechanisms whose CMPA scores are neither significant nor change much over time. The members of the other classes are enumerated below.
In [19]:
# Rows assigned to k-means cluster 0.
df.loc[df['label'].eq(0)]
Out[19]:
In [20]:
# Rows assigned to k-means cluster 1.
df.loc[df['label'].eq(1)]
Out[20]:
In [21]:
# Rows assigned to k-means cluster 2.
df.loc[df['label'].eq(2)]
Out[21]:
In [22]:
# Rows assigned to k-means cluster 3.
df.loc[df['label'].eq(3)]
Out[22]:
In [23]:
# Rows assigned to k-means cluster 4.
df.loc[df['label'].eq(4)]
Out[23]: