In [1]:
%matplotlib inline
import json
import pandas as pd
In [21]:
def loadContributions(file, withsexe=False):
    """Load consultation contributions and encode the answers as 0/1 columns.

    Parameters
    ----------
    file : str, path object or file-like object
        JSON source handed to ``pandas.read_json`` with ``orient="columns"``.
        Every record must provide an ``id`` and a ``questions`` list, plus a
        ``sexe`` field when ``withsexe`` is true.
    withsexe : bool, default False
        When true, add a ``sexe`` column holding 0 for ``'Homme'`` and 1 for
        any other value.

    Returns
    -------
    pandas.DataFrame
        One row per contribution, in file order, with a default integer index.
        Columns are ``id``, optionally ``sexe``, then for every retained
        question a ``"<titreQuestion> : <texte>"`` column and one
        ``"<titreQuestion>. (Réponse) <texte> -> <critere texte>"`` column per
        selected criterion. Cells are 1 where the contribution has the answer
        and 0 elsewhere.

    Notes
    -----
    A question is skipped when it has no (or an empty) ``Reponse``, when its
    ``texte`` starts with ``'Conna'`` or when its ``titreQuestion`` ends with
    ``'34'``.
    """
    contributions = pd.read_json(path_or_buf=file, orient="columns")

    rows = []
    # Iterate over records rather than over ``range(n)`` with label lookups:
    # this does not depend on the frame having a 0..n-1 index.
    for record in contributions.to_dict(orient="records"):
        row = {'id': record['id']}
        if withsexe:
            row['sexe'] = 0 if record['sexe'] == 'Homme' else 1

        for question in record['questions']:
            answers = question.get('Reponse')
            if not answers:
                continue
            title = question['titreQuestion']
            text = question['texte']
            if text[0:5] == 'Conna' or title[-2:] == '34':
                continue

            # Flag that the question was answered at all ...
            row[title + ' : ' + text] = 1
            # ... and flag each criterion that was selected.
            for criteres in answers:
                row[title + '. (Réponse) ' + text + ' -> ' + str(criteres['critere'].get('texte'))] = 1

        rows.append(row)

    # Answers absent from a contribution come out of the constructor as NaN.
    return pd.DataFrame(data=rows).fillna(0)
# Build the answer table, including the binary 'sexe' column.
df = loadContributions('../data/EGALITE5.brut.json', withsexe=True)
df = df.fillna(0)
# Index the rows by contribution id; 'id' also remains available as a column.
df = df.set_index('id', drop=False)
# Disabled export / column-renaming experiments, kept for reference:
#df.to_csv('consultation_an.csv', format='%d')
#df.columns = ['Q_' + str(col+1) for col in range(len(df.columns) - 2)] + ['id' , 'sexe']
df.head()
Out[21]:
In [22]:
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
# Feature matrix for clustering: every column of df except the identifier.
# 'clusterindex' is removed too when it already exists, so that re-running this
# cell after labels have been written back to df does not feed the previous
# cluster assignment into the features.
X = df.drop('id', axis=1).drop(columns=['clusterindex'], errors='ignore').values
def train_kmeans(nb_clusters, X):
    """Fit a K-Means model with ``nb_clusters`` clusters on ``X``.

    Parameters
    ----------
    nb_clusters : int
        Number of clusters to fit.
    X : array-like
        Feature matrix passed unchanged to ``KMeans.fit``.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator.
    """
    # n_init is pinned explicitly: the library default is 10 in older
    # scikit-learn releases and 'auto' from 1.4 on, so leaving it implicit makes
    # the clustering depend on the installed version. random_state fixes the seed.
    return KMeans(n_clusters=nb_clusters, n_init=10, random_state=0).fit(X)
#print(kmeans.predict(X))
#kmeans.cluster_centers_
def select_nb_clusters(data=None, cluster_range=range(2, 10)):
    """Score K-Means models over a range of cluster counts.

    Parameters
    ----------
    data : array-like, optional
        Feature matrix to cluster. Defaults to the module-level ``X``.
    cluster_range : iterable of int, default ``range(2, 10)``
        Candidate numbers of clusters.

    Returns
    -------
    dict
        Maps each candidate number of clusters to the silhouette score of the
        corresponding fitted model. The dict is also printed once.
    """
    if data is None:
        data = X

    perfs = {}
    for nbclust in cluster_range:
        labels = train_kmeans(nbclust, data).labels_
        # The model is unsupervised, so an internal index is used as the
        # criterion. The Calinski-Harabasz index was tried before the
        # silhouette score, see
        # http://scikit-learn.org/stable/modules/clustering.html#calinski-harabaz-index
        perfs[nbclust] = metrics.silhouette_score(data, labels)

    print(perfs)
    return perfs
# Provisional assignment with a fixed 4 clusters.
provisional_model = train_kmeans(4, X)
df['clusterindex'] = provisional_model.predict(X)
#df

# Score every candidate number of clusters.
perfs = select_nb_clusters()
# Result recorded from an earlier run:
# {2: 341.07570462155348, 3: 227.39963334619881, 4: 186.90438345452918, 5: 151.03979976346525, 6: 129.11214073405731, 7: 112.37235520885432, 8: 102.35994869157568, 9: 93.848315820675438}
# NOTE(review): silhouette scores lie in [-1, 1], so these values presumably
# come from the Calinski-Harabasz variant of the criterion - confirm or refresh.

# Keep the candidate with the highest score.
optimal_nb_clusters = max(perfs, key=lambda nb: perfs[nb])
print("optimal_nb_clusters", optimal_nb_clusters)
In [23]:
# Refit with the selected number of clusters and store each row's cluster label.
km_model = train_kmeans(nb_clusters=optimal_nb_clusters, X=X)
cluster_labels = km_model.predict(X)
df['clusterindex'] = cluster_labels
# Per-cluster mean of every column.
lGroupBy = df.groupby(['clusterindex']).mean()
In [24]:
# Group once and derive both per-cluster summaries from the same grouping.
_rows_by_cluster = df.groupby(['clusterindex'])
cluster_profile_counts = _rows_by_cluster.count()
cluster_profile_means = _rows_by_cluster.mean()
# Whole-population counterparts of the two tables above.
global_counts, global_means = df.count(), df.mean()
In [25]:
# Per-cluster count of non-null values for every column (at most 10 clusters shown).
cluster_profile_counts.head(10)
Out[25]:
In [26]:
# Minimum gap between a cluster mean and the global mean for a column to be
# considered as distinguishing the clusters.
MIN_PROFILE_DEVIATION = 0.05

# One row per cluster: the cluster label followed by the per-cluster mean of
# every column. Built in a single step instead of cell-by-cell chained
# assignment (df_profiles[col][cluster] = ...), which pandas may apply to a
# temporary copy and therefore silently drop.
nbclusters = cluster_profile_means.shape[0]
df_profiles = cluster_profile_means.reset_index()

# Columns to scan: everything except the cluster label and the contribution
# identifier. 'id' is not a feature (it is excluded from X as well), so its
# mean carries no information about the cluster profiles.
profile_columns = [col for col in df_profiles.columns if col not in ("clusterindex", "id")]

# Largest absolute gap, over clusters, between cluster mean and global mean.
max_deviation = (df_profiles[profile_columns] - df[profile_columns].mean()).abs().max()

# Kept as a dict (column -> True) because later cells read its keys.
intereseting_columns = {
    col: True for col in profile_columns if max_deviation[col] > MIN_PROFILE_DEVIATION
}
In [27]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
Here, the optimal model has two clusters: cluster 0 with 399 cases and cluster 1 with 537 cases.
Because this model is based on binary inputs, the best description of the clusters is the distribution of zeros and ones of each input (question).
The figure below gives the cluster profiles of this model, with cluster 0 on the left and cluster 1 on the right. The questions on which the two clusters differ stand out as the highest bars.
In [28]:
# Keep only the flagged columns and order them by label.
interesting = list(intereseting_columns)
df_profiles_sorted = df_profiles.loc[:, interesting].sort_index(axis=1)

# First a 1x1-inch figure with the default legend, then the full-size chart
# without legend.
# NOTE(review): the tiny figure is presumably there only to render the legend
# separately - confirm.
df_profiles_sorted.plot.bar(figsize=(1, 1))
df_profiles_sorted.plot.bar(figsize=(16, 8), legend=False)
Out[28]:
In [29]:
# Transposed view of the profiles: one row per retained column, one column per cluster row.
df_profiles_sorted.T
Out[29]:
In [30]:
#df_profiles.sort_index(axis=1).T
Deux groupes de personnes émergent : celles qui connaissent le téléphone grand danger et celles qui ne le connaissent pas.
Celles qui connaissent le téléphone grand danger ont moins l'impression qu'il y a un déficit d'information sur cette mesure, approuvent davantage la géolocalisation des victimes et connaissent la disposition d'ordonnance, à laquelle elles sont plus favorables. Elles sont aussi plus favorables (bien que minoritaires) au fait de réserver l’occupation du logement au conjoint victime des violences. Les personnes qui ne connaissent pas le téléphone grand danger ont tendance à accorder un peu plus d'importance à l'aide juridictionnelle et aux violences psychologiques.
En ignorant la connaissance du téléphone grand danger (et le fait d'avoir répondu à la question ouverte), on obtient 4 groupes :
In [ ]: