In [193]:
# -*- coding: utf-8 -*-
"""
Created on Sat november 05 13:18:15 2016
@author: Sidon
"""
%matplotlib inline
import pandas as pd
import numpy as np
from collections import OrderedDict
from tabulate import tabulate, tabulate_formats
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x:'%f'%x)
usecols = ['country','incomeperperson','alcconsumption','armedforcesrate','breastcancerper100th','co2emissions',
'femaleemployrate','internetuserate','lifeexpectancy','polityscore', 'relectricperperson',
'suicideper100th', 'employrate', 'urbanrate']
# Load from CSV
data0 = pd.read_csv('~/dev/coursera/gapminder.csv', skip_blank_lines=True,
usecols=usecols)
In [194]:
def to_num(list, data):
    """Coerce the given columns of ``data`` to numeric dtype.

    Parameters
    ----------
    list : iterable of str
        Column names to convert (parameter name kept for caller
        compatibility, although it shadows the builtin).
    data : pandas.DataFrame
        Frame whose columns are converted in place.

    Returns
    -------
    pandas.DataFrame
        The same frame; listed columns are numeric, with unparseable
        entries replaced by NaN.
    """
    for col in list:
        # BUG FIX: the original passed the string 'errors=coerce' positionally,
        # which is not a valid value for the `errors` parameter and raises
        # ValueError. Pass it as a keyword argument instead.
        data[col] = pd.to_numeric(data[col], errors='coerce')
    return data
In [195]:
def interpret_clusters(n, train):
    """Fit a KMeans model with ``n`` clusters on ``train`` and plot it.

    The observations are projected onto their first two principal
    components so cluster membership can be shown in two dimensions.
    Returns the fitted KMeans model.
    """
    km = KMeans(n_clusters=n)
    km.fit(train)

    # Reduce the clustering variables to two canonical variables for display
    projection = PCA(2).fit_transform(train)

    plt.subplots(figsize=(12, 8))
    plt.scatter(x=projection[:, 0], y=projection[:, 1], c=km.labels_, s=200)
    plt.xlabel('Canonical variable 1')
    plt.ylabel('Canonical variable 2')
    plt.title('Scatterplot of Canonical Variables for '+str(n)+' Clusters')
    plt.show()
    return km
In [196]:
# Short, human-friendly names for the Gapminder columns (same order as usecols)
columns = ['country','income','alcohol','army','bcancer','co2','female-employ','net-rate','life', 'polity',
           'relectricperperson', 'suicideper100th', 'employ','urban']

# Cluster on every variable except the country label and the validation
# target (life expectancy)
cluster_cols = [col for col in columns if col not in ('country', 'life')]
#cluster_cols = ['income','alcohol', 'net-rate' ]
In [197]:
# Rename columns for clarity
data0.columns = columns

# Coerce the analysis columns to numeric (invalid entries become NaN),
# then drop every row that still has a missing value.
data0 = to_num(cluster_cols + ['life'], data0).dropna(axis=0, how='any')

# Work on a copy so the cleaned frame is preserved
data1 = data0.copy()
In [198]:
# Subset clustering variables
cluster = data1.loc[:, cluster_cols]
#print (cluster)
In [199]:
# standardize clustering variables to have mean=0 and sd=1
cluster_s = cluster.copy()
for col in cluster_s.columns:
    cluster_s[col] = preprocessing.scale(cluster_s[col].astype('float64'))
#print(cluster_s)
In [200]:
# Hold out 30% of the standardized observations; the fixed seed keeps this
# split aligned with the life-expectancy split made further below.
clus_train, clus_test = train_test_split(cluster_s, test_size=0.3, random_state=123)
In [201]:
# k-means cluster analysis for k = 1..8 clusters
from scipy.spatial.distance import cdist
range_clusters = range(1, 9)
meandist = []
for k in range_clusters:
    # FIX: seed KMeans so the elbow curve is reproducible across runs
    # (the original had no random_state in a stochastic analysis).
    model = KMeans(n_clusters=k, random_state=123)
    model.fit(clus_train)
    # Average distance of each observation to its nearest cluster center.
    # (The original also computed an unused `clusassign` prediction.)
    meandist.append(sum(np.min(cdist(clus_train, model.cluster_centers_, 'euclidean'), axis=1))
                    / clus_train.shape[0])
In [202]:
# Average within-cluster distances for k = 1..8 (input to the elbow plot below)
print (meandist)
In [203]:
"""
Plot average distance from observations from the cluster centroid
to use the Elbow Method to identify number of clusters to choose
"""
plt.subplots(figsize=(12, 8))
plt.plot(range_clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')
plt.show()
In [204]:
# Interpret 6 cluster solution (the original comment said 3, but the code
# requests 6 clusters)
model = interpret_clusters(6, clus_train)
In [205]:
# Recover the original row index so cluster labels can be merged back on
clus_train.reset_index(level=0, inplace=True)
# Map each original row index to its assigned cluster label
index_to_cluster = dict(zip(list(clus_train['index']), list(model.labels_)))
newclus = pd.DataFrame.from_dict(index_to_cluster, orient='index')
newclus.columns = ['cluster']
In [206]:
# Attach the cluster assignment to the training observations
newclus.reset_index(level=0, inplace=True)
merged_train = pd.merge(clus_train, newclus, on='index')
merged_train.head(n=100)
# Cell output: number of training observations assigned to each cluster
merged_train['cluster'].value_counts()
Out[206]:
In [207]:
# Per-cluster means of the (standardized) clustering variables
clustergrp = merged_train.groupby('cluster').mean()
print("Clustering variable means by cluster")
print(clustergrp)
In [208]:
# Validate the clusters by testing for differences in life expectancy
# ('life') across clusters using ANOVA.  The 'gpa' variable names follow
# the course template, but the response here is life expectancy.
gpa_data = data0['life']
# Split the response with the same seed used for the clustering variables
# so the training rows line up.
gpa_train, gpa_test = train_test_split(gpa_data, test_size=0.3, random_state=123)
gpa_train1 = pd.DataFrame(gpa_train)
gpa_train1.reset_index(level=0, inplace=True)
merged_train_all = pd.merge(gpa_train1, merged_train, on='index')
sub1 = merged_train_all[['life', 'cluster']].dropna()

import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi

# One-way ANOVA: life expectancy explained by cluster membership
gpamod = smf.ols(formula='life ~ C(cluster)', data=sub1).fit()
print(gpamod.summary())
In [209]:
# Mean life expectancy within each cluster
print('means for life by cluster')
m1 = sub1.groupby('cluster').mean()
print(m1)
In [210]:
# Standard deviation of life expectancy within each cluster
print('standard deviations for life by cluster')
m2 = sub1.groupby('cluster').std()
print(m2)
In [211]:
# Tukey HSD post hoc test: which cluster pairs differ in mean life expectancy
mc1 = multi.MultiComparison(sub1['life'], sub1['cluster'])
res1 = mc1.tukeyhsd()
print(res1.summary())
In [ ]: