In [1]:
%matplotlib inline
import json
import pandas as pd
In [31]:
def loadContributions(file, withsexe=False):
    # Load the raw contributions and flatten each one into a row of binary indicators.
    contributions = pd.read_json(path_or_buf=file, orient="columns")
    rows = []
    rindex = []
    for i in range(0, contributions.shape[0]):
        row = {}
        row['id'] = contributions['id'][i]
        rindex.append(contributions['id'][i])
        if withsexe:
            # Encode the declared gender as a binary feature.
            if contributions['sexe'][i] == 'Homme':
                row['sexe'] = 0
            else:
                row['sexe'] = 1
        for question in contributions['questions'][i]:
            # Skip unanswered questions, the 'Savez...' questions and question 10.
            if question.get('Reponse') and question['texte'][0:5] != 'Savez' and question['titreQuestion'][-2:] != '10':
                row[question['titreQuestion'] + ' : ' + question['texte']] = 1
                for criteres in question.get('Reponse'):
                    # print(criteres['critere'].keys())
                    row[question['titreQuestion'] + '. (Réponse) ' + question['texte'] + ' -> ' + str(criteres['critere'].get('texte'))] = 1
        rows.append(row)
    df = pd.DataFrame(data=rows)
    df.fillna(0, inplace=True)
    return df
df = loadContributions('../data/EGALITE2.brut.json', True)
df.fillna(0, inplace=True)
df.index = df['id']
#df.to_csv('consultation_an.csv', format='%d')
#df.columns = ['Q_' + str(col+1) for col in range(len(df.columns) - 2)] + ['id' , 'sexe']
df.head()
Out[31]:
In [32]:
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np

X = df.drop('id', axis=1).values

def train_kmeans(nb_clusters, X):
    # Fit a k-means model with a fixed random state for reproducibility.
    kmeans = KMeans(n_clusters=nb_clusters, random_state=0).fit(X)
    return kmeans
#print(kmeans.predict(X))
#kmeans.cluster_centers_

def select_nb_clusters():
    # Score candidate cluster counts with an internal (unsupervised) metric.
    perfs = {}
    for nbclust in range(2, 10):
        kmeans_model = train_kmeans(nbclust, X)
        labels = kmeans_model.labels_
        # from http://scikit-learn.org/stable/modules/clustering.html#calinski-harabaz-index
        # we are in an unsupervised model. cannot get better!
        # perfs[nbclust] = metrics.calinski_harabaz_score(X, labels)
        perfs[nbclust] = metrics.silhouette_score(X, labels)
    print(perfs)
    return perfs

df['clusterindex'] = train_kmeans(4, X).predict(X)
#df
perfs = select_nb_clusters()
# result :
# {2: 341.07570462155348, 3: 227.39963334619881, 4: 186.90438345452918, 5: 151.03979976346525, 6: 129.11214073405731, 7: 112.37235520885432, 8: 102.35994869157568, 9: 93.848315820675438}
optimal_nb_clusters = max(perfs, key=perfs.get)
print("optimal_nb_clusters", optimal_nb_clusters)
In [33]:
km_model = train_kmeans(optimal_nb_clusters, X)
df['clusterindex'] = km_model.predict(X)
lGroupBy = df.groupby(['clusterindex']).mean()
In [34]:
cluster_profile_counts = df.groupby(['clusterindex']).count()
cluster_profile_means = df.groupby(['clusterindex']).mean()
global_counts = df.count()
global_means = df.mean()
In [35]:
cluster_profile_counts.head(10)
Out[35]:
In [36]:
df_profiles = pd.DataFrame()
nbclusters = cluster_profile_means.shape[0]
df_profiles['clusterindex'] = range(nbclusters)
for col in cluster_profile_means.columns:
    if col != "clusterindex":
        df_profiles[col] = np.zeros(nbclusters)
        for cluster in range(nbclusters):
            df_profiles.loc[cluster, col] = cluster_profile_means[col][cluster]
        # row.append(df[col].mean())
df_profiles.head()
#print(df_profiles.columns)
interesting_columns = {}
for col in df_profiles.columns:
    if col != "clusterindex":
        global_mean = df[col].mean()
        # Keep only the questions whose cluster means differ noticeably from the global mean.
        diff_means_global = abs(df_profiles[col] - global_mean).max()
        # print(col, diff_means_global)
        if diff_means_global > 0.05:
            interesting_columns[col] = True
#print(interesting_columns)
In [37]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
Here, the optimal model has two clusters: cluster 0 with 399 cases and cluster 1 with 537 cases.
Since the model is built on binary inputs, the best way to describe the clusters is through the distribution of zeros and ones for each input (question).
The figure below shows the cluster profiles of this model, cluster 0 on the left and cluster 1 on the right. The questions that differentiate the clusters are those with the highest bars.
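Before the figure, a minimal sketch to check the sizes and profiles quoted above (assuming df and the km_model fitted in the cells above; for binary inputs, the per-cluster mean of a column is simply the share of ones in that cluster):
# Cluster sizes; these should match the 399 / 537 split mentioned above.
print(df['clusterindex'].value_counts().sort_index())
# Share of '1' answers per question and per cluster, i.e. the cluster profile.
ones_share = df.drop('id', axis=1).groupby('clusterindex').mean()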
In [38]:
interesting = list(interesting_columns.keys())
df_profiles_sorted = df_profiles[interesting].sort_index(axis=1)
df_profiles_sorted.plot.bar(figsize=(1, 1))
df_profiles_sorted.plot.bar(figsize=(16, 8), legend=False)
Out[38]:
In [39]:
df_profiles_sorted.T
Out[39]:
In [40]:
#df_profiles.sort_index(axis=1).T
At first glance, there are two populations:
For the people who know (the 'Savez-vous' questions): those who answered question 3 are better informed.
Ignoring these questions about the participants' knowledge, 325 people made written contributions (versus 611 who did not).
This latter group of people is more critical of television's ability to fight gender (F/H) inequalities and of the online reporting mechanisms (+10 points for 'Pas du tout respectée' on Q4 and Q7). Surprisingly, they selected the answer about stereotypes (Q2) less often, but were more receptive to the broadcasting of works created by women (Q5).
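To back this kind of "+10 points" statement, a hedged sketch (assuming df, df_profiles and the interesting list from the cells above) that expresses the gap between each cluster profile and the global mean in percentage points:
# Difference between cluster means and global means, in percentage points,
# restricted to the questions already flagged as interesting.
global_profile = df[interesting].mean()
gaps_pct = (df_profiles[interesting] - global_profile) * 100
print(gaps_pct.round(1).T)  # rows = questions, columns = clusters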
Finally, a last population group appears when this difference is set aside:
In [ ]: