In [1]:
%matplotlib inline
import pymysql.cursors
import json
import pandas as pd
# Connect to the database
connection = pymysql.connect(host='localhost',
user='consultation',
password='consultation',
db='consultation',
charset='latin1',
cursorclass=pymysql.cursors.DictCursor)
In [108]:
# NOTE(review): `results` does not appear to be used again in this notebook —
# confirm before removing.
results = []
# Exact French labels of the content categories a reviewer can tick for a
# contribution; these strings are matched verbatim against the JSON stored
# in tasks.data (see build_result below).
affirmations = [
    "des analyses ou proposition d'ordre général",
    "des prises de position contre une mesure existante",
    "des prises de position en faveur d'une mesure existante",
    "des propositions de nouvelles mesures",
    "des questions ou demandes d'information",
    "l'évocation d'expérience(s) personnelle(s)",
    "une analyse des causes du problème",
    "une description d'un problème"]
# Short column names used in the DataFrame, parallel to `affirmations`
# (same order, same length).
affirmations_short = [
    'analyse ou proposit°',
    'contre 1 mesure',
    'pour 1 mesure',
    'nouvelles mesures',
    'questions',
    'XP personnelle',
    'analyse des causes',
    'description pb'
]
# Raw answer keys for the suggested follow-up action, and the display labels
# used as DataFrame column names (parallel lists, same order).
usage = ["cite", "prend en compte", "recois", "rien", "éclaircir"]
usage_label = ["cite", "prend en compte", "reçoit", "ne fait rien", "demande à éclaircir"]
def build_result(row):
    """Flatten one crowdsourcing task row in place.

    ``row['data']`` holds a JSON document describing the reviewer's answers.
    Decode it and expand it into flat 0/1 indicator columns:

    * one column per entry of ``affirmations_short`` (content categories),
    * ``original`` (1 if the reviewer answered "oui"),
    * one column per entry of ``usage_label`` (suggested follow-up action).

    The raw ``data`` column is removed. Returns the mutated ``row`` so the
    function can be used directly in a comprehension.
    """
    data = json.loads(row['data'])
    # Some tasks store a bare JSON string (no structured answers); only dict
    # payloads carry the reviewer's selections.
    if not isinstance(data, str):
        if 'affirmations' in data:
            row['affirmations'] = data['affirmations']
            for label, short in zip(affirmations, affirmations_short):
                row[short] = 1 if label in data['affirmations'] else 0
        if 'original' in data:
            row['original'] = 1 if "oui" in data['original'] else 0
        if 'usage' in data:
            for key, label in zip(usage, usage_label):
                row[label] = 1 if key in data['usage'] else 0
    row.pop('data')
    return row
# Fetch every reviewer answer joined to the contribution it concerns, then
# flatten each row into indicator columns with build_result.
with connection.cursor() as cursor:
    sql = "SELECT `document_id`, `data`, documents.source, documents.theme, documents.question, documents.text FROM `tasks` INNER JOIN `documents` ON documents.id = tasks.document_id"
    cursor.execute(sql)
    crowdsourced = [build_result(row) for row in cursor.fetchall()]
In [109]:
# Build the flat results table; indicators a reviewer did not tick were
# absent from the JSON and become 0 here.
df = pd.DataFrame(data=crowdsourced)
df = df.fillna(0)
# Export a snapshot for external use (indicators rendered as integers).
df.to_csv('crowdsourcing_consultation_an.csv', float_format='%.0f')
df.head()
Out[109]:
In [110]:
# Share of contributions in which each content category was identified,
# most frequent first.
affirmations_total = (df[affirmations_short].sum() / df.shape[0]).sort_values(ascending=False)
plot = affirmations_total.plot(
    kind='bar',
    title='Éléments identifiés par les rapporteurs')
plot.set_ylabel('Pourcentage de contributions')
# Average number of categories ticked per contribution.
affirmations_total.sum()
Out[110]:
In [111]:
# Fraction of contributions judged "original" by the reviewers.
original = df[['original']].sum() / len(df)
original
Out[111]:
In [112]:
# Share of contributions per suggested follow-up action, most frequent first.
usage_total = (df[usage_label].sum() / len(df)).sort_values(ascending=False)
plot = usage_total.plot(
    kind='bar',
    title='Les actions à mener sur les contributions selon les rapporteurs')
plot.set_ylabel('Pourcentage de contributions')
Out[112]:
In [113]:
# How many independent reviews did each contribution receive?
# (`gp_by_document` is reused by the validation loop below.)
gp_by_document = df.groupby('document_id')
count_by_document = gp_by_document.document_id.count()
count_by_document = count_by_document.sort_values(ascending=False)
plot = count_by_document.plot(
    kind='hist',
    # `normed` was deprecated and removed in matplotlib 3.x; `density` is
    # the supported equivalent.
    density=True,
    title='Nombre de fois qu\'une contribution a été analysée')
plot.set_ylabel('Pourcentage des contributions')
Out[113]:
In [114]:
def validation_method_1(data):
    # Production method: validate when the reviewers' (categories +
    # originality) answer rows contain more than two duplicate rows.
    # NOTE(review): duplicated() flags repeats of an earlier row, so a single
    # unanimous group needs 4 identical rows to pass, not 3 — confirm the
    # intended agreement threshold.
    return data[affirmations_short + ['original']].duplicated().sum() > 2
def validation_method_2(data):
    # Same duplicate-count rule as method 1, but on the content categories
    # only (originality ignored).
    return data[affirmations_short].duplicated().sum() > 2
def validation_method_3(data):
    # Validate when, for at least one content category, the (category,
    # originality) answer pair is duplicated more than twice across the
    # reviewers of this document.
    return any(
        data[[short, 'original']].duplicated().sum() > 2
        for short in affirmations_short
    )
# Fraction of contributions validated under each method. Every agreeing
# document adds 1/document_count, so the totals are already percentages.
score = {'method_1': 0, 'method_2': 0, 'method_3': 0}
document_count = len(df.document_id.unique())
validators = [
    ('method_1', validation_method_1),
    ('method_2', validation_method_2),
    ('method_3', validation_method_3),
]
for _, group in gp_by_document:
    for name, is_valid in validators:
        if is_valid(group):
            score[name] += 1 / document_count
In [115]:
# Compare the three validation methods side by side.
score_frame = pd.DataFrame(data=[score])
plot = score_frame.plot(
    kind='bar',
    title='Pourcentage de contributions validées suivant les 3 méthodes')
plot.set_ylabel('Pourcentage des contributions validées')
Out[115]:
In [116]:
# Raw validated fractions per method (exact numbers behind the bar chart).
score
Out[116]:
In [141]:
# Composite "usage" score per answer. Weights per suggested action
# ("ne fait rien" scores 0): reçoit=5, cite=4, demande à éclaircir=3,
# prend en compte=2.
df['usage'] = (df['reçoit'] * 5
               + df['cite'] * 4
               + df['demande à éclaircir'] * 3
               + df['prend en compte'] * 2)
# Average every indicator over the reviewers of each document, ranked by
# the composite usage score. Column order matches `affirmations_short`.
score_columns = ['document_id', 'usage', 'original'] + affirmations_short
resultat_crowdsource = (df[score_columns]
                        .groupby('document_id')
                        .mean()
                        .sort_values(by='usage', ascending=False))
# Re-expose the id as a column for the merges below.
resultat_crowdsource['document_id'] = resultat_crowdsource.index
resultat_crowdsource.head()
Out[141]:
In [3]:
# Load the contributions table, indexed by document_id so later column
# assignments align on the id.
with connection.cursor() as cursor:
    sql = "SELECT id as document_id, source, theme, question, text FROM documents"
    cursor.execute(sql)
    documents = pd.DataFrame(data=cursor.fetchall())
documents.index = documents['document_id']
documents.head()
Out[3]:
In [162]:
documents_crowdsource = documents.merge(right=resultat_crowdsource,right_on='document_id', left_on='document_id')
documents_crowdsource.sort_values(by='usage', ascending=False, inplace=True)
documents_crowdsource.to_csv('crowdsource_resultat_analyses.csv')
documents_crowdsource.head()
Out[162]:
In [4]:
# Load the free-text syntheses written by reviewers (the length filter skips
# empty/placeholder entries).
with connection.cursor() as cursor:
    sql = "SELECT document_id, synthese FROM tasks WHERE length(synthese) > 3"
    cursor.execute(sql)
    syntheses = pd.DataFrame(data=cursor.fetchall())
# Each synthesis is "<affirmation> : <text>": the label before the first
# " : " is the affirmation, the rest (re-joined in case it contained " : "
# itself) is the synthesis proper.
# NOTE(review): the `!= ''` filter looks redundant given the SQL length
# filter — confirm before simplifying.
syntheses['affirmation'] = syntheses[syntheses['synthese'] != '']['synthese'].apply(lambda s: s.split(' : ')[0])
syntheses['synthese'] = syntheses[syntheses['synthese'] != '']['synthese'].apply(lambda s: " : ".join(s.split(' : ')[1:]))
documents_crowdsource = documents.merge(right=syntheses,right_on='document_id', left_on='document_id')
documents_crowdsource.to_csv('crowdsource_resultat_syntheses.csv')
# Trailing space so syntheses can be concatenated per document below.
documents_crowdsource['synthese'] = documents_crowdsource['synthese']+" "
documents_crowdsource.head()
Out[4]:
In [13]:
# Aggregate every synthesis of a document into one string next to the
# original text; ' BR ' acts as a separator token for the TF-IDF step.
# NOTE(review): this cell is not idempotent — each re-run appends another
# ". " to every synthesis.
documents_crowdsource['synthese'] = documents_crowdsource['synthese']+". "
# groupby().sum() on strings concatenates them per document_id.
documents_text = documents_crowdsource[['document_id','synthese']].groupby('document_id').sum()
# `documents` is indexed by document_id (set above), so these column
# assignments align on the index.
documents_text['text'] = documents['text']
documents_text['tout'] = documents['text']+' BR '+documents_text['synthese']
documents_text['document_id'] = documents_text.index
# Rebuild with a plain RangeIndex; the column labels follow the order the
# columns were created in above.
documents_text = pd.DataFrame(data=documents_text.values, columns=['synthese', 'text','tout', 'document_id'])
documents_text.head()
Out[13]:
TF-IDF with French stop words:
In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# French stop-word list. TfidfVectorizer lowercases tokens before filtering,
# so stop words must be lowercase: the original 'BR' (the synthesis
# separator injected above) could never match — it is 'br' here.
french_stop_words = ['br', "alors", "au", "aucuns", "aussi", "autre", "avant", "avec", "avoir", "bon", "car", "ce", "cela", "ces", "ceux", "chaque", "ci", "comme", "comment", "dans", "des", "du", "dedans", "dehors", "depuis", "devrait", "doit", "donc", "dos", "début", "elle", "elles", "en", "encore", "essai", "est", "et", "eu", "fait", "faites", "fois", "font", "hors", "ici", "il", "ils", "je", "juste", "la", "le", "les", "leur", "là", "ma", "maintenant", "mais", "mes", "mine", "moins", "mon", "mot", "même", "ni", "nommés", "notre", "nous", "ou", "où", "par", "parce", "pas", "peut", "peu", "plupart", "pour", "pourquoi", "quand", "que", "quel", "quelle", "quelles", "quels", "qui", "sa", "sans", "ses", "seulement", "si", "sien", "son", "sont", "sous", "soyez", "sujet", "sur", "ta", "tandis", "tellement", "tels", "tes", "ton", "tous", "tout", "trop", "très", "tu", "voient", "vont", "votre", "vous", "vu", "ça", "étaient", "état", "étions", "été", "être"]

corpus = documents_text['tout']
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 3), stop_words=french_stop_words)
# NOTE(review): the original code fitted on `corpus` ('tout': text +
# syntheses) and then immediately refitted on 'text' alone, discarding the
# first fit. Only the effective 'text' fit is kept here — confirm which
# corpus was actually intended.
X = vectorizer.fit_transform(documents_text['text'])
idf = vectorizer.idf_
tfidf = pd.DataFrame(data=list(zip(vectorizer.get_feature_names(), idf)),
                     columns=['mot', 'interet'])
# Keep only distinctive terms (high idf = rare across documents).
mots = tfidf[tfidf['interet'] > 3].sort_values(by='interet', ascending=False)['mot']
tfidf.sort_values(by='interet', ascending=False).head()
Out[17]:
In [18]:
# Dense document × term TF-IDF matrix (acceptable here: small corpus).
# `toarray()` returns an ndarray; `todense()` returned the deprecated
# numpy.matrix type.
documents_mots = pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names())
documents_mots.head()
Out[18]:
In [19]:
# Total TF-IDF mass of the distinctive terms across the corpus, most
# salient first.
documents_mots[mots.values].sum().sort_values(ascending=False)
Out[19]:
In [13]:
In [ ]: