In [2]:
import pandas as pd
import numpy as np
df_unknowns = pd.read_csv('unknown.csv')
# Get the numerical values (skip the first four columns) as a NumPy array
unknowns = df_unknowns.iloc[:, 4:].values
In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
print '\nPlot the distributions of unknown columns (BSC, GSC, LDS):'
print '\nBSC_1 to BSC_101'
bsc = ['BSC_' + str(i) for i in xrange(1, 102)]
plot = df_unknowns[bsc].plot(kind='hist', alpha=0.5, legend=None)
In [4]:
print '\nPlot several random BSC samples:'
fig, axes = plt.subplots(nrows=2, ncols=2)
df_unknowns['BSC_1'].plot(ax=axes[0,0], kind='hist', alpha=0.5)
df_unknowns['BSC_10'].plot(ax=axes[0,1], kind='hist', alpha=0.5)
df_unknowns['BSC_20'].plot(ax=axes[1,0], kind='hist', alpha=0.5)
df_unknowns['BSC_30'].plot(ax=axes[1,1], kind='hist', alpha=0.5)
Out[4]:
In [33]:
print '\nGSC_1 to GSC_119'
gsc = ['GSC_' + str(i) for i in xrange(1, 120)]
plot = df_unknowns[gsc].plot(kind='hist', alpha=0.5, legend=None)
In [36]:
print '\nLDS_1 to LDS_79'
lds = ['LDS_' + str(i) for i in xrange(1, 80)]
plot = df_unknowns[lds].plot(kind='hist', alpha=0.5, legend=None)
In [3]:
def row_summary(df):
    # Extract column headers
    featNames = list(df.columns)
    # Count the number of NaNs in each row
    nan_counts = df.isnull().sum(axis=1)
    # Collect the indices of incomplete rows (rows containing at least one NaN)
    nan_row_inds = list()
    for i, x in enumerate(nan_counts):
        if x > 0: nan_row_inds.append(i)
    return nan_row_inds

def clean_records(df):
    nan_row_inds = row_summary(df)
    clean_df = df.drop(df.index[nan_row_inds], inplace=False)
    # Double-check for NaNs
    print 'Are there any NaNs in the clean records?', clean_df.isnull().values.any()
    return clean_df
df = pd.read_csv('Data_Adults_1.csv', index_col=0)
clean_df = clean_records(df)
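For reference, pandas' built-in dropna performs the same row-wise cleaning in a single call; a minimal equivalent sketch, assuming df as loaded above:

clean_df = df.dropna(axis=0)               # drop every row containing at least one NaN
assert not clean_df.isnull().values.any()  # double-check, as in clean_records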
In [6]:
# Keep only numerical values
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
X = clean_df.select_dtypes(include=numerics)
cols2drop = ['Patient_ID', 'GROUP_ID', 'doa', 'Baseline_header_id', 'Concentration_header_id', \
'Baseline_Reading_id', 'Concentration_Reading_id']
# Drop certain columns
X = X.drop(cols2drop, axis=1, inplace=False)
print 'm =', X.shape[1]
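As an aside, np.number covers all of the integer and float dtypes listed above, so the numerical selection step can be written more compactly; a one-line sketch assuming clean_df from the previous cell:

X = clean_df.select_dtypes(include=[np.number])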
In [15]:
from sklearn.cluster import KMeans
k = 4
# Note the transpose puts the features in rows (shape: n_features x n_samples),
# so it is the features that get clustered.
data = X.values.T
kmeans = KMeans(n_clusters=k)
kmeans.fit(data)
Out[15]:
In [16]:
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
In [17]:
for i in range(k):
    # Extract the members of each cluster
    ds = data[np.where(labels == i)]
    # Plot the cluster members with symbol 'o'
    plt.plot(ds[:, 0], ds[:, 1], 'o')
    # Plot the centroid with symbol 'x'
    lines = plt.plot(centroids[i, 0], centroids[i, 1], 'x')
    plt.setp(lines, ms=8.0)
    plt.setp(lines, mew=2.0)
Based on the figure above, clustering on all the features is not a good idea at this point, since there are no clear cluster boundaries. However, the clustering does reveal the presence of outliers.
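To make the outlier remark concrete, one option is to rank the rows that were fed to KMeans by their distance to the nearest centroid; a minimal sketch using the fitted kmeans object from above (the cutoff of ten is arbitrary, purely for illustration):

# Distance of each row of `data` to every centroid; keep the distance to the closest one.
dists = kmeans.transform(data).min(axis=1)
# Indices of the ten rows farthest from their own centroid -- outlier candidates.
outlier_inds = np.argsort(dists)[-10:]
print outlier_inds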
In [26]:
from sklearn.decomposition import PCA
pca = PCA(n_components=15)
pca.fit(X.values)
Out[26]:
In [28]:
print '\nExplained Variance Ratios:'
print pca.explained_variance_ratio_
plt.plot(pca.explained_variance_ratio_)
plt.ylabel('Variance Explained')
plt.xlabel('Number of Principal Components')
Out[28]:
Given the explained variance ratios printed and plotted above, if capturing 90% of the variance is acceptable, we can reduce the data to k = 10 principal components, far fewer than the original features (k = 10 << m = 736).
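The choice of k can also be read off programmatically from the cumulative explained variance; a small sketch using the fitted pca object, where 0.90 is the target variance fraction mentioned above (valid as long as that level is reached within the fitted components):

cum_var = np.cumsum(pca.explained_variance_ratio_)
k = np.argmax(cum_var >= 0.90) + 1   # smallest k whose cumulative ratio reaches 90%
print 'k =', k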
In [29]:
# Looking at what columns are favored by the first two principal dimensions
print '\nColumns favored by the first principal component:'
pc = pd.DataFrame(pca.components_, columns=X.columns).T
pc.sort_values(0, ascending=False)[:6]
Out[29]:
In [30]:
print '\nColumns favored by the second principal component:'
pc.sort_values(1, ascending=False)[:6]
Out[30]:
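To round off the PCA inspection, the records can also be projected onto the first two principal components and scatter-plotted; a minimal sketch reusing the fitted pca object and the numerical matrix X from above:

# Project the cleaned numerical records onto the principal components.
X_proj = pca.transform(X.values)
plt.scatter(X_proj[:, 0], X_proj[:, 1], alpha=0.5)
plt.xlabel('PC 1')
plt.ylabel('PC 2')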