In [2]:
import pandas as pd
import numpy as np
df_unknowns = pd.read_csv('unknown.csv')
# Get the numerical values (skip the first four columns) as a NumPy array
unknowns = df_unknowns.iloc[:, 4:].values
In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
print '\nPlot the distributions of unknown columns (BSC, GSC, LDS):'
print '\nBSC_1 to BSC_101'
bsc = ['BSC_' + str(i) for i in xrange(1, 102)]
plot = df_unknowns[bsc].plot(kind='hist', alpha=0.5, legend=None)
In [4]:
print '\nPlot several random BSC samples:'
fig, axes = plt.subplots(nrows=2, ncols=2)
df_unknowns['BSC_1'].plot(ax=axes[0,0], kind='hist', alpha=0.5)
df_unknowns['BSC_10'].plot(ax=axes[0,1], kind='hist', alpha=0.5)
df_unknowns['BSC_20'].plot(ax=axes[1,0], kind='hist', alpha=0.5)
df_unknowns['BSC_30'].plot(ax=axes[1,1], kind='hist', alpha=0.5)
Out[4]:
In [33]:
print '\nGSC_1 to GSC_119'
gsc = ['GSC_' + str(i) for i in xrange(1, 120)]
plot = df_unknowns[gsc].plot(kind='hist', alpha=0.5, legend=None)
In [36]:
print '\nLDS_1 to LDS_79'
lds = ['LDS_' + str(i) for i in xrange(1, 80)]
plot = df_unknowns[lds].plot(kind='hist', alpha=0.5, legend=None)
In [3]:
def row_summary(df):
    # Extract column headers
    featNames = list(df.columns)
    # Count the number of NaNs in each row
    nan_counts = df.isnull().sum(axis=1)
    # Collect the indices of incomplete rows (rows containing at least one NaN)
    nan_row_inds = list()
    for i, x in enumerate(nan_counts):
        if x > 0: nan_row_inds.append(i)
    return nan_row_inds

def clean_records(df):
    nan_row_inds = row_summary(df)
    clean_df = df.drop(df.index[nan_row_inds], inplace=False)
    # Double-check for NaNs
    print 'Are there any NaNs in the clean records?', clean_df.isnull().values.any()
    return clean_df
df = pd.read_csv('Data_Adults_1.csv', index_col=0)
clean_df = clean_records(df)
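For reference, pandas' built-in dropna performs the same row-wise cleaning in a single call; a minimal equivalent sketch, assuming df as loaded above:

clean_df = df.dropna(axis=0)               # drop every row containing at least one NaN
assert not clean_df.isnull().values.any()  # double-check, as in clean_records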
In [6]:
# Keep only numerical values
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
X = clean_df.select_dtypes(include=numerics)
cols2drop = ['Patient_ID', 'GROUP_ID', 'doa', 'Baseline_header_id', 'Concentration_header_id', \
'Baseline_Reading_id', 'Concentration_Reading_id']
# Drop certain columns
X = X.drop(cols2drop, axis=1, inplace=False)
print 'm =', X.shape[1]
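As an aside, np.number covers all of the integer and float dtypes listed above, so the numerical selection step can be written more compactly; a one-line sketch assuming clean_df from the previous cell:

X = clean_df.select_dtypes(include=[np.number])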
In [15]:
from sklearn.cluster import KMeans
k = 4
# Note the transpose puts the features in rows (shape: n_features x n_samples),
# so it is the features that get clustered.
data = X.values.T
kmeans = KMeans(n_clusters=k)
kmeans.fit(data)
Out[15]:
In [16]:
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
In [17]:
for i in range(k):
    # Extract the members of each cluster
    ds = data[np.where(labels == i)]
    # Plot the cluster members with symbol 'o'
    plt.plot(ds[:, 0], ds[:, 1], 'o')
    # Plot the centroid with symbol 'x'
    lines = plt.plot(centroids[i, 0], centroids[i, 1], 'x')
    plt.setp(lines, ms=8.0)
    plt.setp(lines, mew=2.0)
Based on the figure above, clustering on all the features is not a good idea at this point, since there are no clear cluster boundaries. However, the clustering does reveal the presence of outliers.
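To make the outlier remark concrete, one option is to rank the rows that were fed to KMeans by their distance to the nearest centroid; a minimal sketch using the fitted kmeans object from above (the cutoff of ten is arbitrary, purely for illustration):

# Distance of each row of `data` to every centroid; keep the distance to the closest one.
dists = kmeans.transform(data).min(axis=1)
# Indices of the ten rows farthest from their own centroid -- outlier candidates.
outlier_inds = np.argsort(dists)[-10:]
print outlier_inds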
In [26]:
from sklearn.decomposition import PCA
pca = PCA(n_components=15)
pca.fit(X.values)
Out[26]:
In [28]:
print '\nExplained Variance Ratios:'
print pca.explained_variance_ratio_
plt.plot(pca.explained_variance_ratio_)
plt.ylabel('Variance Explained')
plt.xlabel('Number of Principal Components')
Out[28]:
Given the explained variance ratios printed and plotted above, if capturing 90% of the variance is acceptable, we can reduce the data to k = 10 principal components, far fewer than the original features (k = 10 << m = 736).
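The choice of k can also be read off programmatically from the cumulative explained variance; a small sketch using the fitted pca object, where 0.90 is the target variance fraction mentioned above (valid as long as that level is reached within the fitted components):

cum_var = np.cumsum(pca.explained_variance_ratio_)
k = np.argmax(cum_var >= 0.90) + 1   # smallest k whose cumulative ratio reaches 90%
print 'k =', k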
In [29]:
# Looking at what columns are favored by the first two principal dimensions
print '\nColumns favored by the first principal component:'
pc = pd.DataFrame(pca.components_, columns=X.columns).T
pc.sort_values(0, ascending=False)[:6]
Out[29]:
In [30]:
print '\nColumns favored by the second principal component:'
pc.sort_values(1, ascending=False)[:6]
Out[30]:
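To round off the PCA inspection, the records can also be projected onto the first two principal components and scatter-plotted; a minimal sketch reusing the fitted pca object and the numerical matrix X from above:

# Project the cleaned numerical records onto the principal components.
X_proj = pca.transform(X.values)
plt.scatter(X_proj[:, 0], X_proj[:, 1], alpha=0.5)
plt.xlabel('PC 1')
plt.ylabel('PC 2')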