In [1]:
import pandas as pd
import numpy as np
'''
Reduced features
'''
df_feats = pd.read_csv('reduced_data.csv')
df_labels = pd.read_csv('disorders.csv')['Depressed']
df_red = pd.concat([df_feats, df_labels], axis=1)
df_red_depr = df_red.loc[df_red['Depressed'] == 1].drop(['Depressed'], axis=1, inplace=False) # depressed
df_red_not_depr = df_red.loc[df_red['Depressed'] == 0].drop(['Depressed'], axis=1, inplace=False) # not depressed
# Get reduced features
red_depr = df_red_depr.values
red_not_depr = df_red_not_depr.values
In [2]:
'''
Original features
'''
df_patient = pd.read_csv('patient_info.csv')
df_disorders = pd.read_csv('disorders.csv')
df_questionnaire = pd.read_csv('questionnaire.csv')
df_base_concen = pd.read_csv('base_concen.csv')
df_patient = df_patient[['Age', 'Gender_id', 'race_id', 'location_id']]
df_disorders.drop(['Patient_ID', 'ADHD_Type'], axis=1, inplace=True)
df_questionnaire.drop(['Patient_ID', 'BSC_Respondent', 'GSC_Respondent', 'LDS_Respondent'], axis=1, inplace=True)
df_base_concen.drop(['Patient_ID', 'Baseline_header_id', 'Concentration_header_id'], axis=1, inplace=True)
df = pd.concat([df_patient, df_disorders, df_questionnaire, df_base_concen], axis=1)
df_depr = df.loc[df['Depressed'] == 1].drop(['Depressed'], axis=1, inplace=False) # depressed
df_not_depr = df.loc[df['Depressed'] == 0].drop(['Depressed'], axis=1, inplace=False) # not depressed
# Get original features
depr = df_depr.values
not_depr = df_not_depr.values
We assume that the samples are drawn independently and identically within each group:

depressed samples: $X_i \overset{iid}{\sim} F_D$

non-depressed samples: $X_i \overset{iid}{\sim} F_{ND}$

This is both an independence and an identical-distribution assumption on the samples.

We assume the features are not sampled from an identical distribution:

depressed features: $X_{ij} \sim F_{D_j}$

non-depressed features: $X_{ij} \sim F_{ND_j}$

To check independence of the samples, we examine the matrix rank of a set of randomly selected samples:

$X_i \overset{iid}{\sim} F, \qquad (X_1, X_2, \ldots, X_n) \sim F = \prod_{i=1}^{n} F_i, \qquad F_i = F_j \;\; \forall i, j$

To check that the samples are identically distributed, we fit a mixture model and see whether the optimal number of clusters is 1:

$F = \prod_{i=1}^{n} F_i, \qquad \sum_{i=1}^{n} w_i F_i(\theta)$

To check that the features are not identically distributed, we fit a mixture model over the features and see whether the optimal number of clusters is larger than 1:

$F_i = \prod_{j=1}^{m} F_{ij}, \qquad \sum_{j=1}^{m} w_j F_{ij}(\theta)$
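As a quick illustration of the rank-based check before applying it to the real data, here is a toy sketch on synthetic data (the matrices below are made up for illustration only): rows drawn independently at random give a full-rank matrix, while duplicating a row immediately reduces the rank.

In [ ]:
import numpy as np

# Toy example: 5 rows drawn independently at random are full rank (almost surely)
toy = np.random.randn(5, 5)
print("Rank with independent rows: " + str(np.linalg.matrix_rank(toy)))       # expected: 5

# Duplicating a row introduces linear dependence and drops the rank
toy_dep = toy.copy()
toy_dep[4] = toy_dep[0]
print("Rank with a duplicated row:  " + str(np.linalg.matrix_rank(toy_dep)))  # expected: 4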
First, let's try to determine whether the rows of the reduced depressed-patient matrix are independent.
In [3]:
import numpy as np

# We deliberately do not fix a random seed: each run should draw a fresh random sample,
# so that repeated runs cover as many rows of the large matrix as possible.
# Before checking for identical-ness, have a look at the shape of the matrix.
print("Shape of reduced features: " + str(red_depr.shape))

# Counter for samples that contain linearly dependent rows
dep_count = 0
for loop in range(0, 100):
    # The number of rows far exceeds the number of reduced features (27), which guarantees
    # linear dependence of the full matrix. To work around this, randomly sample 27 rows at a time.
    randrows = np.random.randint(red_depr.shape[0], size=27)
    double_red_depr = red_depr[randrows]
    # print("Shape of double reduced features: " + str(double_red_depr.shape))
    # Compute the rank of this 27 x 27 matrix; if the rank is less than 27,
    # at least two of the sampled rows are linearly dependent.
    rank = np.linalg.matrix_rank(double_red_depr)
    if rank != 27:
        # print("Interdependent rows with matrix rank: " + str(rank))
        dep_count += 1
print("Likely lower bound on dependent patients: " + str(dep_count))
Our reasoning here is that the probability of the same two rows being selected together in two different sets of 27 drawn from 1205 candidates is fairly low, but over 100 repetitions some double counting is still likely. Even allowing for that double counting, if the count stays below 5% of the total number of patients we can reasonably say that the data are mostly independent.
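To put rough numbers on this intuition, the sketch below uses the same 1205 rows, draws of size 27, and 100 repetitions as the cell above; the probability expressions are standard binomial calculations. A fixed row lands in any one draw with probability of roughly 2%, is expected to appear in a bit over 2 of the 100 draws, and appears in at least two draws with probability around two thirds, which is why double counting of the same dependent patient is plausible.

In [ ]:
# Rough double-counting arithmetic for the sampling scheme above
n_rows, sample_size, n_draws = 1205, 27, 100

# np.random.randint samples indices with replacement, so the probability that a
# fixed row appears in one draw of 27 indices is 1 - (1 - 1/n_rows)^27
p_row = 1.0 - (1.0 - 1.0 / n_rows) ** sample_size
print("P(fixed row in one draw)        = " + str(round(p_row, 4)))

# Draws are independent, so the number of draws containing a fixed row is Binomial(100, p_row)
print("Expected draws containing a row = " + str(round(n_draws * p_row, 2)))

p_at_least_two = (1.0 - (1.0 - p_row) ** n_draws
                  - n_draws * p_row * (1.0 - p_row) ** (n_draws - 1))
print("P(fixed row in >= 2 draws)      = " + str(round(p_at_least_two, 3)))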
Following the same procedure for the set of reduced non-depressed patients:
In [4]:
import numpy as np

# Again, no fixed random seed: each run should draw a fresh random sample,
# so that repeated runs cover as many rows of the large matrix as possible.
# Before checking for identical-ness, have a look at the shape of the matrix.
print("Shape of reduced features: " + str(red_not_depr.shape))

# Counter for samples that contain linearly dependent rows
dep_count = 0
for loop in range(0, 100):
    # As before, randomly sample 27 rows at a time; this set has 1591 rows,
    # so sample indices from the actual number of rows rather than a hard-coded 1205.
    randrows = np.random.randint(red_not_depr.shape[0], size=27)
    double_red_not_depr = red_not_depr[randrows]
    # print("Shape of double reduced features: " + str(double_red_not_depr.shape))
    # Compute the rank of this 27 x 27 matrix; if the rank is less than 27,
    # at least two of the sampled rows are linearly dependent.
    rank = np.linalg.matrix_rank(double_red_not_depr)
    if rank != 27:
        # print("Interdependent rows with matrix rank: " + str(rank))
        dep_count += 1
print("Likely lower bound on dependent patients: " + str(dep_count))
Likely number of dependent entries for depressed set: 34/1205
Likely number of dependent entries for not depressed set: 64/1591
Given these counts, roughly 2.8% and 4.0% of the respective groups, it is reasonable to say that the patient samples are by and large independent.
In [5]:
import itertools
import matplotlib as mpl
from scipy import linalg
from sklearn.mixture import GMM
%matplotlib inline
import matplotlib.pyplot as plt

def plot_BIC(X, title):
    lowest_bic = np.infty
    bic = []
    n_components_range = range(1, 6)
    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in n_components_range:
            # Fit a mixture of Gaussians with EM for each covariance type / component count
            gmm = GMM(n_components=n_components, covariance_type=cv_type)
            gmm.fit(X)
            bic.append(gmm.bic(X))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm
    bic = np.array(bic)
    color_iter = itertools.cycle(['k', 'r', 'g', 'b', 'c', 'm', 'y'])
    clf = best_gmm
    bars = []
    # Plot the BIC scores
    fig = plt.figure()
    fig.suptitle(title, fontsize=14)
    fig.set_size_inches(12, 4)
    plt.subplots_adjust(top=0.85)
    spl = plt.subplot(2, 1, 1)
    for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
        xpos = np.array(n_components_range) + .2 * (i - 2)
        bars.append(plt.bar(xpos, bic[i * len(n_components_range):
                                      (i + 1) * len(n_components_range)],
                            width=.2, color=color))
    plt.xticks(n_components_range)
    plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
    plt.title('BIC score per model')
    # Mark the model with the lowest BIC with an asterisk
    xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 + \
        .2 * np.floor(bic.argmin() / len(n_components_range))
    plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
    spl.set_xlabel('Number of components')
    spl.legend([b[0] for b in bars], cv_types, loc='best', prop={'size': 10})
In [6]:
# Reduced Feature BIC
plot_BIC(red_depr, 'Reduced Depressed Features')
plot_BIC(red_not_depr, 'Reduced Non-Depressed Features')
We plot the BIC (Bayesian information criterion) scores to decide which covariance type to use.
The model with the lowest BIC, as returned by sklearn and marked with the asterisk in the plots above, is preferred; the spherical covariance type is therefore selected.
The definition on Wikipedia can cause some confusion about the sign convention; an explanation can be found here: http://forum.hugin.com/index.php?topic=188.0.
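For reference, sklearn's gmm.bic(X) follows the lower-is-better convention:

$\mathrm{BIC} = -2 \ln \hat{L} + p \ln(n)$

where $\hat{L}$ is the maximized likelihood, $p$ is the number of free parameters, and $n$ is the number of observations. Texts that instead work with the negated quantity, $2 \ln \hat{L} - p \ln(n)$, treat higher values as better, which is the usual source of the sign confusion mentioned above.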
In [7]:
def evaluate_clusters(X):
    i = np.linspace(1, 16, 16, dtype='int')
    bic = np.array(())
    for idx in i:
        print("Fitting and evaluating model with " + str(idx) + " clusters.")
        gmm = GMM(n_components=idx, n_iter=1000, covariance_type='spherical')
        gmm.fit(X)
        bic = np.append(bic, gmm.bic(X))
    # Plot the reciprocal of the BIC against the number of clusters
    plt.figure(figsize=(7, 7))
    plt.plot(i, 1.0 / bic)
    plt.title('BIC')
    plt.ylabel('score')
    plt.xlabel('number of clusters')
    plt.show()
    print(bic)
In [8]:
evaluate_clusters(red_depr)
In [9]:
evaluate_clusters(red_not_depr)
From the two figures above, the optimal numbers of clusters are at least 16 and 15 respectively, so the data were most likely not sampled identically from a single distribution. The Gaussian mixture model analysis thus provides further evidence that the identically-distributed-samples assumption does not hold.
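To read the best component count directly from the numbers rather than from the plots, here is a minimal sketch along the lines of evaluate_clusters (the helper name best_k_by_bic is ours; it simply re-fits the spherical GMMs and returns the k with the lowest BIC):

In [ ]:
import numpy as np
from sklearn.mixture import GMM

def best_k_by_bic(X, max_k=16):
    # Re-fit spherical GMMs for k = 1..max_k and return the k with the lowest BIC
    bics = []
    for k in range(1, max_k + 1):
        gmm = GMM(n_components=k, n_iter=1000, covariance_type='spherical')
        gmm.fit(X)
        bics.append(gmm.bic(X))
    return int(np.argmin(bics)) + 1

# Results can vary slightly between runs because EM is randomly initialised:
# print("Best k (reduced depressed):     " + str(best_k_by_bic(red_depr)))
# print("Best k (reduced non-depressed): " + str(best_k_by_bic(red_not_depr)))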
In [10]:
from sklearn.cluster import KMeans
from mpl_toolkits.axes_grid1 import Grid

def cluster_feats(X, k, title, feat_names):
    # Visualize the clustering result
    fig = plt.figure()
    fig.suptitle(title, fontsize=14, fontweight='bold')
    fig.set_size_inches(10, 4)
    plt.subplots_adjust(top=0.85)
    grid = Grid(fig, rect=111, nrows_ncols=(1, 2),
                axes_pad=0.5, label_mode='L',
                )
    for i, ax in enumerate(grid):
        # Fit the data; transpose so that each feature (column) becomes an observation to cluster
        data = X[i].T
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(data)
        # Get cluster labels and centroids
        labels = kmeans.labels_
        centroids = kmeans.cluster_centers_
        # Plot clusters
        for j in range(k):
            # Extract observations within each cluster
            ds = data[np.where(labels == j)]
            # Plot the observations with symbol o
            ax.plot(ds[:, 0], ds[:, 1], 'o')
            # Plot the centroids with symbol x
            lines = ax.plot(centroids[j, 0], centroids[j, 1], 'x')
            plt.setp(lines, ms=8.0)
            plt.setp(lines, mew=2.0)
        ax.set_title(feat_names[i])
In [11]:
# Reduced Feature Clustering
red_feats = [red_depr, red_not_depr]
k = 4 # number of clusters
cluster_feats(red_feats, k, 'Reduced Features', ['Depressed', 'Not Depressed'])
# Original Feature Clustering
feats = [depr, not_depr]
k = 3 # number of clusters
cluster_feats(feats, k, 'Original Features', ['Depressed', 'Not Depressed'])
As shown in the figures above, the reduced depressed and non-depressed features are not identically distributed: both fall into four clusters, while the original depressed features fall into two groups and the original non-depressed features into three. Note that the reduced feature vectors have 27 features and the original feature vectors have 737.
From the above, we conclude that our assumption of non-identically distributed features holds.
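As an optional follow-up, the separation of the feature clusters can be quantified rather than judged visually. A minimal sketch using scikit-learn's silhouette_score (not used elsewhere in this notebook; the helper name feature_silhouette is ours):

In [ ]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def feature_silhouette(X, k):
    # Cluster the features (columns), as cluster_feats does, and return the mean silhouette score.
    # Values near 0 suggest overlapping clusters; values closer to 1 suggest well-separated clusters.
    data = X.T
    labels = KMeans(n_clusters=k).fit_predict(data)
    return silhouette_score(data, labels)

# Example usage with the cluster counts chosen above:
# print("Silhouette, reduced depressed (k=4):     " + str(feature_silhouette(red_depr, 4)))
# print("Silhouette, reduced non-depressed (k=4): " + str(feature_silhouette(red_not_depr, 4)))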