In [204]:
%matplotlib inline
import json
import pandas as pd
In [205]:
#contributions = pd.read_json(path_or_buf='../data/EGALITE4.brut.json', orient="columns")
In [546]:
def loadContributions(file, withsexe=False):
    """Load one consultation export and flatten it into a table of 0/1 indicators.

    Every contribution in the JSON file becomes one row. For each question
    that carries a non-empty 'Reponse' list, one column flags that the
    question was answered, and one further column per answer flags the
    selected criterion text.

    Parameters
    ----------
    file : path or buffer
        Passed unchanged to ``pandas.read_json`` (``orient="columns"``).
    withsexe : bool, default False
        When true, add a 'sexe' column: 0 when the contribution's 'sexe'
        field equals 'Homme', 1 for any other value.

    Returns
    -------
    pandas.DataFrame
        One row per contribution: an 'id' column, the optional 'sexe' column
        and the indicator columns. Indicators missing for a row are 0.

    Raises
    ------
    KeyError
        If a contribution lacks 'id', or lacks 'sexe' while ``withsexe`` is
        true.
    """
    contributions = pd.read_json(path_or_buf=file, orient="columns")
    rows = []
    # Walk the records themselves instead of range(len) + label lookup, so the
    # loop does not depend on the frame's index being exactly 0..n-1.
    for contribution in contributions.to_dict(orient="records"):
        row = {'id': contribution['id']}
        if withsexe:
            row['sexe'] = 0 if contribution['sexe'] == 'Homme' else 1
        questions = contribution['questions']
        # A contribution without a question list (read back as NaN) simply
        # contributes no indicator instead of aborting the whole load.
        if not isinstance(questions, list):
            questions = []
        for question in questions:
            reponses = question.get('Reponse')
            if not reponses:
                continue
            # Indicator: the question received at least one answer.
            row[question['titreQuestion'] + ' : ' + question['texte']] = 1
            # One indicator per selected criterion.
            for criteres in reponses:
                row[question['titreQuestion'] + '. (Réponse) ' + question['texte'] + ' -> ' + str(criteres['critere'].get('texte'))] = 1
        rows.append(row)
    df = pd.DataFrame(data=rows)
    # Columns absent from a row come back as NaN; they mean "not selected".
    return df.fillna(0)
# Load the first export (the only one loaded with the 'sexe' column), then
# outer-join exports 2 to 6 on the contribution id so that a contribution
# present in any file gets a row.
df = loadContributions('../data/EGALITE1.brut.json', True)
for part in range(2, 7):
    df = df.merge(right=loadContributions('../data/EGALITE%d.brut.json' % part), how='outer', on='id')
# A contribution absent from a given export has no indicator for it: use 0.
df = df.fillna(0)
# Index the rows by id while keeping 'id' available as a regular column.
df.index = df['id']
# DataFrame.to_csv has no 'format' keyword; float_format='%d' is the option
# that writes the 0.0 / 1.0 indicators as integers.
df.to_csv('consultation_an.csv', float_format='%d')
df.head()
Out[546]:
In [547]:
# Replace the merged frame with the contributions of export 4 only (including
# the 'sexe' column); the cells below operate on this single export.
df = loadContributions('../data/EGALITE4.brut.json', True)
In [548]:
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
# Feature matrix used for clustering: every column except the identifier.
# 'clusterindex' is dropped as well when it is already present, so that
# re-running this cell after labels were written to df does not feed the
# previous cluster labels back in as a feature.
X = df.drop('id', axis=1).drop(columns='clusterindex', errors='ignore').values
def train_kmeans(nb_clusters, X):
    """Fit a KMeans model with ``nb_clusters`` clusters on ``X`` and return it.

    The random state is fixed to 0 so repeated fits on the same data agree.
    """
    model = KMeans(n_clusters=nb_clusters, random_state=0)
    return model.fit(X)
#print(kmeans.predict(X))
#kmeans.cluster_centers_
def select_nb_clusters():
    """Score KMeans fits for 2 to 9 clusters on the module-level ``X``.

    Returns a dict mapping each cluster count to the silhouette score of the
    corresponding fit. The dict is also printed.
    """
    # No ground-truth labels exist for this unsupervised model, so an internal
    # index is used. The Calinski-Harabasz index was the alternative
    # (http://scikit-learn.org/stable/modules/clustering.html#calinski-harabaz-index);
    # the silhouette score is the one in effect.
    scores = {
        candidate: metrics.silhouette_score(X, train_kmeans(candidate, X).labels_)
        for candidate in range(2, 10)
    }
    print(scores)
    return scores
# Provisional labelling with a hand-picked 4 clusters.
# NOTE(review): the next cell overwrites 'clusterindex' with the selected
# model's labels, so this fit looks redundant — confirm before removing.
df['clusterindex'] = train_kmeans(4, X).predict(X)
#df
# Score every candidate cluster count.
perfs = select_nb_clusters();
# result :
# {2: 341.07570462155348, 3: 227.39963334619881, 4: 186.90438345452918, 5: 151.03979976346525, 6: 129.11214073405731, 7: 112.37235520885432, 8: 102.35994869157568, 9: 93.848315820675438}
# NOTE(review): the values recorded above are far outside [-1, 1], so they
# cannot be silhouette scores; presumably they were copied from an earlier
# Calinski-Harabasz run — confirm or refresh.
# Keep the cluster count with the highest score.
optimal_nb_clusters = max(perfs, key=perfs.get);
print("optimal_nb_clusters" , optimal_nb_clusters);
In [549]:
# Refit with the selected number of clusters and store the label predicted
# for each row of X back onto df.
km_model = train_kmeans(optimal_nb_clusters, X);
df['clusterindex'] = km_model.predict(X)
# Per-cluster mean of every column of df.
# NOTE(review): lGroupBy is not referenced again in the visible cells.
lGroupBy = df.groupby(['clusterindex']).mean();
In [550]:
# km_model.__dict__
In [551]:
# Per-cluster number of non-null values and per-cluster mean of each column.
cluster_profile_counts = df.groupby(['clusterindex']).count();
cluster_profile_means = df.groupby(['clusterindex']).mean();
# The same two statistics over all rows, regardless of cluster.
global_counts = df.count()
global_means = df.mean()
In [552]:
# Display the first rows of the per-cluster counts.
cluster_profile_counts.head()
Out[552]:
In [561]:
#cluster_profile_means.head()
In [554]:
#df.info()
In [555]:
# Build the cluster profile table: one row per cluster, a 'clusterindex'
# column followed by the per-cluster mean of every other column.
nbclusters = cluster_profile_means.shape[0]
profile_columns = [col for col in cluster_profile_means.columns if col != "clusterindex"]
# Select the rows labelled 0..nbclusters-1 in one label-based lookup. The
# earlier element-wise chained assignment (frame[col][row] = value) raises
# SettingWithCopyWarning and writes nothing under pandas copy-on-write.
df_profiles = cluster_profile_means.loc[list(range(nbclusters)), profile_columns].reset_index(drop=True)
df_profiles.insert(0, 'clusterindex', list(range(nbclusters)))

# A column is "interesting" when at least one cluster mean differs from the
# mean over all rows by more than 0.1 in absolute value.
# NOTE(review): every column of cluster_profile_means is considered here,
# which may include 'id' — confirm whether identifiers should be excluded.
max_deviation = (df_profiles[profile_columns] - df[profile_columns].mean()).abs().max()
# The (misspelled) name is kept because a later cell reads it.
intereseting_columns = {col: True for col in profile_columns if max_deviation[col] > 0.1}
In [556]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
In [557]:
#cols = [ col for col in cluster_profile_counts.columns]
#cluster_profile_means.ix[0].plot.bar()
Here, the optimal model has two clusters: cluster 0 with 399 cases, and cluster 1 with 537 cases.
This model is based on binary inputs, so the best description of the clusters is the distribution of zeros and ones of each input (question).
The figure below gives the cluster profiles of this model, with cluster 0 on the left and cluster 1 on the right. The questions involved are those on which the clusters differ (highest bars).
In [558]:
# Keep only the columns flagged as interesting, ordered by column name.
interesting = list(intereseting_columns.keys())
df_profiles_sorted = df_profiles[interesting].sort_index(axis=1)
# NOTE(review): a 1x1 inch figure with the legend left on — presumably a way
# to render the legend on its own, since the real plot disables it; confirm.
df_profiles_sorted.plot.bar(figsize =(1, 1))
# Main bar chart: one group of bars per cluster row, legend hidden.
df_profiles_sorted.plot.bar(figsize =(16, 8), legend=False)
Out[558]:
In [559]:
# Transposed view: one row per interesting column, one column per cluster.
df_profiles_sorted.T
Out[559]:
In [560]:
# Full profile table, columns ordered by name, transposed for reading.
df_profiles.sort_index(axis=1).T
Out[560]:
In [ ]: