In [2]:
%matplotlib inline
import json
import pandas as pd
In [3]:
def loadContributions(file, withsexe=False):
contributions = pd.read_json(path_or_buf=file, orient="columns")
rows = [];
rindex = [];
for i in range(0, contributions.shape[0]):
row = {};
row['id'] = contributions['id'][i]
rindex.append(contributions['id'][i])
if (withsexe):
if (contributions['sexe'][i] == 'Homme'):
row['sexe'] = 0
else:
row['sexe'] = 1
for question in contributions['questions'][i]:
if (question.get('Reponse')) and question['titreQuestion'][-2:] != '38' and question['titreQuestion'][-2:] != '37':
row[question['titreQuestion']+' : '+question['texte']] = 1
for criteres in question.get('Reponse'):
# print(criteres['critere'].keys())
row[question['titreQuestion']+'. (Réponse) '+question['texte']+' -> '+str(criteres['critere'].get('texte'))] = 1
rows.append(row)
df = pd.DataFrame(data=rows)
df.fillna(0, inplace=True)
return df
df = loadContributions('../data/EGALITE6.brut.json', True)
df.fillna(0, inplace=True)
df.index = df['id']
#df.to_csv('consultation_an.csv', format='%d')
#df.columns = ['Q_' + str(col+1) for col in range(len(df.columns) - 2)] + ['id' , 'sexe']
df.head()
Out[3]:
In [4]:
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
X = df.drop('id', axis=1).values
def train_kmeans(nb_clusters, X):
kmeans = KMeans(n_clusters=nb_clusters, random_state=0).fit(X)
return kmeans
#print(kmeans.predict(X))
#kmeans.cluster_centers_
def select_nb_clusters():
perfs = {};
for nbclust in range(2,10):
kmeans_model = train_kmeans(nbclust, X);
labels = kmeans_model.labels_
# from http://scikit-learn.org/stable/modules/clustering.html#calinski-harabaz-index
# we are in an unsupervised model. cannot get better!
# perfs[nbclust] = metrics.calinski_harabaz_score(X, labels);
perfs[nbclust] = metrics.silhouette_score(X, labels);
print(perfs);
return perfs;
df['clusterindex'] = train_kmeans(4, X).predict(X)
#df
perfs = select_nb_clusters();
# result :
# {2: 341.07570462155348, 3: 227.39963334619881, 4: 186.90438345452918, 5: 151.03979976346525, 6: 129.11214073405731, 7: 112.37235520885432, 8: 102.35994869157568, 9: 93.848315820675438}
optimal_nb_clusters = max(perfs, key=perfs.get);
print("optimal_nb_clusters" , optimal_nb_clusters);
In [5]:
km_model = train_kmeans(optimal_nb_clusters, X);
df['clusterindex'] = km_model.predict(X)
lGroupBy = df.groupby(['clusterindex']).mean();
In [6]:
cluster_profile_counts = df.groupby(['clusterindex']).count();
cluster_profile_means = df.groupby(['clusterindex']).mean();
global_counts = df.count()
global_means = df.mean()
In [7]:
cluster_profile_counts.head(10)
Out[7]:
In [8]:
df_profiles = pd.DataFrame();
nbclusters = cluster_profile_means.shape[0]
df_profiles['clusterindex'] = range(nbclusters)
for col in cluster_profile_means.columns:
if(col != "clusterindex"):
df_profiles[col] = np.zeros(nbclusters)
for cluster in range(nbclusters):
df_profiles[col][cluster] = cluster_profile_means[col][cluster]
# row.append(df[col].mean());
df_profiles.head()
#print(df_profiles.columns)
intereseting_columns = {};
for col in df_profiles.columns:
if(col != "clusterindex"):
global_mean = df[col].mean()
diff_means_global = abs(df_profiles[col] - global_mean). max();
# print(col , diff_means_global)
if(diff_means_global > 0.05):
intereseting_columns[col] = True
#print(intereseting_columns)
In [9]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
Here, the optimal model ihas two clusters , cluster 0 with 399 cases, and 1 with 537 cases.
As this model is based on binary inputs. Given this, the best description of the clusters is by the distribution of zeros and ones of each input (question).
The figure below gives the cluster profiles of this model. Cluster 0 on the left. 1 on the right. The questions invloved as different (highest bars)
In [10]:
interesting = list(intereseting_columns.keys())
df_profiles_sorted = df_profiles[interesting].sort_index(axis=1)
df_profiles_sorted.plot.bar(figsize =(1, 1))
df_profiles_sorted.plot.bar(figsize =(16, 8), legend=False)
Out[10]:
In [11]:
df_profiles_sorted.T
Out[11]:
In [12]:
#df_profiles.sort_index(axis=1).T
Deux groupes de personnes émergent
Ces dernières sont plus favorable à être contactée et déclare plus largement pouvoir améliorer ou être en contact avec des personnes ayant subi le problème
En ignorant les deux dernières questions, on obtient deux autres groupes :
Ce dernier groupe ont déclarer plus de motivation à répondre sur d'auters thèmes et pense pouvoir plus aider
In [ ]: