In [1]:
from __future__ import division
import os
import pymongo, pandas, random
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
# Reset matplotlib's rc settings to their defaults before applying the ggplot
# style, so every figure in this notebook starts from the same look.
plt.rcdefaults()
mpl.style.use('ggplot')
# Client for the MongoDB server on localhost:27017; each community is read
# from the database of the same name (see the plotting loop below).
connection = pymongo.MongoClient('localhost', 27017)
# Top-5 communities by number of users at the time of the dump
# communities = ["stackoverflow", "superuser", "serverfault", "math", "programmers"]
# Communities to plot; the loop below produces one figure per entry.
communities = ['ux']
In [2]:
def plot_histogram(female_sample, male_sample, place, name):
    """Overlay normalised histograms of two log-transformed samples.

    Both samples are transformed with ``log(x + 1)`` and binned into the
    fixed unit-wide bins ``0, 1, ..., 9`` so that panels drawn with this
    function share the same x scale.

    Parameters
    ----------
    female_sample, male_sample : array-like of numbers
        Raw (untransformed) values, e.g. per-user counts.
    place : matplotlib Axes
        Axes the histograms, legend, title and grid are drawn on.
    name : str
        Title of the panel.
    """
    bins = range(10)
    # log1p(x) is log(x + 1) without the precision loss for small x.
    female_log = np.log1p(np.asarray(female_sample, dtype=float))
    male_log = np.log1p(np.asarray(male_sample, dtype=float))

    # `density=True` is the replacement for the `normed=True` keyword, which
    # newer matplotlib releases no longer accept.
    place.hist(female_log, bins, density=True, label="females", alpha=0.5,
               color=mpl.cm.Dark2(0))
    place.hist(male_log, bins, density=True, label="males", alpha=0.5)
    place.set_ylim(ymax=1.2)

    legend = place.legend(loc='best')
    frame = legend.get_frame()
    frame.set_facecolor('white')
    frame.set_edgecolor('grey')

    place.set_title(name, fontsize=13, fontweight='bold')
    place.grid(color='grey', linestyle='-.', linewidth=0.5, alpha=0.7, zorder=0, which="both")
    # Axes.set_axis_bgcolor was removed from matplotlib; set_facecolor is its successor.
    place.set_facecolor('white')
In [3]:
def plot_histogram2(female_sample, male_sample, place, name):
    """Overlay normalised histograms of two log-transformed samples using
    bins derived from the data.

    Missing values (NaN) are discarded, both samples are transformed with
    ``log(x + 1)``, and the ten bins are computed from the pooled
    *transformed* values so the bin edges live on the same scale as the data
    being plotted.

    Parameters
    ----------
    female_sample, male_sample : array-like of numbers
        Raw (untransformed) values; may contain NaN.
    place : matplotlib Axes
        Axes the histograms, legend, title and grid are drawn on.
    name : str
        Title of the panel.
    """
    female_values = np.asarray(female_sample, dtype=float)
    male_values = np.asarray(male_sample, dtype=float)
    female_log = np.log1p(female_values[~np.isnan(female_values)])
    male_log = np.log1p(male_values[~np.isnan(male_values)])

    # Shared bin edges for both groups, computed on the log scale. Computing
    # them on the raw values would give edges that do not match the
    # log-transformed data passed to hist().
    bins = np.histogram(np.hstack([female_log, male_log]))[1]

    # `density=True` is the replacement for the `normed=True` keyword, which
    # newer matplotlib releases no longer accept.
    place.hist(female_log, bins, density=True, label="females", alpha=0.5,
               color=mpl.cm.Dark2(0))
    place.hist(male_log, bins, density=True, label="males", alpha=0.5)

    legend = place.legend(loc='best')
    frame = legend.get_frame()
    frame.set_facecolor('white')
    frame.set_edgecolor('grey')

    place.set_title(name, fontsize=13, fontweight='bold')
    place.grid(color='grey', linestyle='-.', linewidth=0.5, alpha=0.7, zorder=0, which="both")
    # Axes.set_axis_bgcolor was removed from matplotlib; set_facecolor is its successor.
    place.set_facecolor('white')
In [4]:
def plot_density(female_sample, male_sample, place, name):
    """Plot Gaussian kernel density estimates of two samples on one axes.

    Missing values (NaN) are discarded before the densities are estimated.
    Both curves are evaluated on a common grid with a spacing of about 0.1
    that runs from one unit below the smallest observation up to and
    including the largest observation.

    Parameters
    ----------
    female_sample, male_sample : array-like of numbers
        Observations; may contain NaN. After NaN removal each sample must
        still be valid input for ``scipy.stats.gaussian_kde``.
    place : matplotlib Axes
        Axes the curves, legend, title and grid are drawn on.
    name : str
        Title of the panel.
    """
    females_ = np.asarray(female_sample, dtype=float)
    females_ = females_[~np.isnan(females_)]
    males_ = np.asarray(male_sample, dtype=float)
    males_ = males_[~np.isnan(males_)]

    # Use the public scipy.stats name; the `stats.kde` submodule is a
    # deprecated private namespace.
    density_female = stats.gaussian_kde(females_)
    density_male = stats.gaussian_kde(males_)

    minimum = min(females_.min(), males_.min())
    maximum = max(females_.max(), males_.max())
    # linspace includes the right end point, so the curve reaches the largest
    # observation; an arange with a 0.1 step would stop one step short of it.
    steps = int(round((maximum - minimum + 1) / .1))
    x = np.linspace(minimum - 1, maximum, steps + 1)

    place.plot(x, density_female(x), label="females", color=mpl.cm.Dark2(0))
    place.plot(x, density_male(x), label="males")

    legend = place.legend(loc='best')
    frame = legend.get_frame()
    frame.set_facecolor('white')
    frame.set_edgecolor('grey')

    place.set_title(name, fontsize=13, fontweight='bold')
    place.grid(color='grey', linestyle='-.', linewidth=0.5, alpha=0.7, zorder=0, which="both")
    # Axes.set_axis_bgcolor was removed from matplotlib; set_facecolor is its successor.
    place.set_facecolor('white')
In [5]:
def plot_cumulative(female_sample, male_sample, place, name):
    """Draw cumulative count curves of two samples as step plots.

    Each sample is binned with ``np.histogram`` (ten equal-width bins over
    its own range) after discarding missing values (NaN), and the running
    total of the bin counts is drawn against the right edge of each bin.

    Parameters
    ----------
    female_sample, male_sample : array-like of numbers
        Observations; may contain NaN.
    place : matplotlib Axes
        Axes the curves, legend, title and grid are drawn on.
    name : str
        Title of the panel.
    """
    female_values = np.asarray(female_sample, dtype=float)
    female_values = female_values[~np.isnan(female_values)]
    male_values = np.asarray(male_sample, dtype=float)
    male_values = male_values[~np.isnan(male_values)]

    values, base = np.histogram(female_values)
    cumulative = np.cumsum(values)
    values2, base2 = np.histogram(male_values)
    cumulative2 = np.cumsum(values2)

    place.set_xlim(xmin=-1, xmax=max(base.max(), base2.max()))
    # cumulative[i] counts the observations up to the RIGHT edge of bin i, so
    # it is paired with base[1:]; pairing it with the left edges would shift
    # the whole curve one bin to the left.
    place.step(base[1:], cumulative, label="females", color=mpl.cm.Dark2(0))
    place.step(base2[1:], cumulative2, label="males")

    legend = place.legend(loc='best')
    frame = legend.get_frame()
    frame.set_facecolor('white')
    frame.set_edgecolor('grey')

    place.set_title(name, fontsize=13, fontweight='bold')
    place.grid(color='grey', linestyle='-.', linewidth=0.5, alpha=0.7, zorder=0, which="both")
    # Axes.set_axis_bgcolor was removed from matplotlib; set_facecolor is its successor.
    place.set_facecolor('white')
In [6]:
def plot_regular_histogram(female_sample, male_sample, place, name):
    """Overlay plain count histograms of two samples.

    The values are plotted untransformed and unnormalised, with matplotlib's
    default binning for each sample. Missing values (NaN) are discarded
    first so they cannot upset the automatic bin range.

    Parameters
    ----------
    female_sample, male_sample : array-like of numbers
        Observations; may contain NaN.
    place : matplotlib Axes
        Axes the histograms, legend, title and grid are drawn on.
    name : str
        Title of the panel.
    """
    female_values = np.asarray(female_sample, dtype=float)
    female_values = female_values[~np.isnan(female_values)]
    male_values = np.asarray(male_sample, dtype=float)
    male_values = male_values[~np.isnan(male_values)]

    place.hist(female_values, label="females", alpha=0.5, color=mpl.cm.Dark2(0))
    place.hist(male_values, label="males", alpha=0.5)

    legend = place.legend(loc='best')
    frame = legend.get_frame()
    frame.set_facecolor('white')
    frame.set_edgecolor('grey')

    place.set_title(name, fontsize=13, fontweight='bold')
    place.grid(color='grey', linestyle='-.', linewidth=0.5, alpha=0.7, zorder=0, which="both")
    # Axes.set_axis_bgcolor was removed from matplotlib; set_facecolor is its successor.
    place.set_facecolor('white')
In [7]:
# Directory the per-community PDFs are written to. savefig() does not create
# missing directories, so make sure it exists before any plotting is done.
output_dir = "test"
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# One entry per panel, in row-major subplot order: (column, title, plotter).
panels = [
    ('questions_total', u"Número de Perguntas", plot_histogram),
    ('answers_total', u"Número de Respostas", plot_histogram),
    ('comments_total', u"Número de Comentários", plot_histogram),
    ('contributions_total', u"Número de Contribuições", plot_histogram),
    ('accepted_rate', u"Taxa de Aceitação", plot_density),
    ('mean_utility', u"Utilidade Média", plot_density),
    ('questions_avg', u"Média dos Votos das Perguntas", plot_density),
    ('activity_freq', u"Frequência de Atividade", plot_density),
]

for community in communities:
    # Users with at least one contribution and a known gender.
    # 'lifetime' is fetched as well but is not plotted at the moment.
    community_db = connection[community]['statistics']
    cursor = community_db.find(
        {'contributions_total': {'$gt': 0}, 'gender': {'$ne': "Unknown"}},
        {u'_id': False, u'gender': True,
         'questions_total': True, 'answers_total': True, 'comments_total': True,
         'contributions_total': True,
         'accepted_rate': True, 'mean_utility': True, 'questions_avg': True,
         'lifetime': True, 'activity_freq': True})
    df = pandas.DataFrame(list(cursor))

    # An empty result has no 'gender' column, so the queries below would fail.
    if df.empty:
        print("No gendered contributors found for %s; skipping" % community)
        continue
    females = df.query("gender == 'Female'")
    males = df.query("gender == 'Male'")
    # The plots compare the two groups, so both need at least some data.
    if females.empty or males.empty:
        print("Need both female and male contributors for %s; skipping" % community)
        continue

    fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(10, 15), dpi=300)
    try:
        fig.suptitle(community, fontsize=15, fontweight='bold')
        fig.tight_layout()
        fig.subplots_adjust(hspace=.4, wspace=0.3, top=0.93)
        for place, (column, title, plotter) in zip(axes.flat, panels):
            plotter(females[column], males[column], place, title)
        fig.savefig(os.path.join(output_dir, community + ".pdf"), format="pdf")
    finally:
        # Always release the figure, even when a panel or the save fails.
        plt.close(fig)
In [ ]: