In [3]:
from __future__ import division
import pymongo, pandas, random
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcdefaults()
mpl.style.use('ggplot')
connection = pymongo.MongoClient('localhost', 27017)
results_db = connection['results']['question_1']
cursor = results_db.find({'questions_pvalue': {'$lt': 0.05}}, {u'_id': False, u'community':True})
communities = list(pandas.DataFrame(list(cursor))['community'])
In [26]:
def plot_histogram(female_sample, male_sample, place, name, bins=None):
    """Overlay density-normalised histograms of two samples on one axes.

    Each sample is transformed with log(x + 1) before binning, so the bin
    edges are expressed on that log scale.

    Parameters
    ----------
    female_sample, male_sample : array-like of numbers
        Raw values to plot (presumably per-user counts -- verify against the
        caller). Empty samples are accepted.
    place : axes-like object
        Target to draw on; must provide ``hist``, ``set_ylim``, ``legend``
        and ``set_title`` (e.g. a matplotlib ``Axes``).
    name : str
        Title for the panel.
    bins : sequence or int, optional
        Forwarded to ``place.hist``. Defaults to ``range(10)``, i.e. unit-wide
        bins covering [0, 9] on the log scale.

    Returns
    -------
    None
    """
    if bins is None:
        bins = range(10)

    for sample, label in ((female_sample, "females"), (male_sample, "males")):
        # log1p(x) == log(x + 1); keeps zero values at 0 instead of -inf.
        logged = np.log1p(np.asarray(sample))
        # `density` replaces the `normed` keyword, which was removed in
        # matplotlib 3.1 (`density` is available from matplotlib 2.1 on).
        place.hist(logged, bins, density=True, label=label, alpha=0.5)

    place.set_ylim(ymax=1.2)
    place.legend(loc='upper right')
    place.set_title(name)
In [27]:
# Start from a clean slate so re-running this cell does not pile up figures.
plt.close('all')

# Two panels per row. Ceiling division avoids the blank extra row that
# `len // 2 + 1` produced for an even number of communities; keep at least
# one row so an empty community list still yields a valid grid.
rows = max(1, (len(communities) + 1) // 2)

# squeeze=False keeps `axes` two-dimensional even when there is a single row,
# so `axes[row][col]` indexing works for any number of communities.
fig, axes = plt.subplots(nrows=rows, ncols=2, figsize=(10, 60), dpi=200,
                         squeeze=False)

# Hide any trailing panel that has no community to show (odd counts).
for unused in axes.flat[len(communities):]:
    unused.set_visible(False)

fig.tight_layout()
In [28]:
# Flattened, row-major view of the grid: panel `idx` is the same axes as
# axes[idx // 2][idx % 2], but this also works when `axes` is 1-D (which
# plt.subplots returns for a single row unless squeeze=False is used).
panels = np.ravel(axes)

# Loop-invariant query pieces: users with at least one question, answer or
# comment, projected down to the fields used below.
activity_filter = {'$or': [{'questions_total': {'$gt': 0}},
                           {'answers_total': {'$gt': 0}},
                           {'comments_total': {'$gt': 0}}]}
projection = {u'_id': False, u'comments_total': True, u'gender': True,
              u'questions_total': True, u'answers_total': True}

for idx, community in enumerate(communities):
    community_db = connection[community]['statistics']
    cursor = community_db.find(activity_filter, projection)
    df = pandas.DataFrame(list(cursor))

    # An empty result (or one lacking the needed fields) yields a frame
    # without these columns; skip it instead of failing on the lookup.
    if df.empty or not {'gender', 'questions_total'}.issubset(df.columns):
        panels[idx].set_title(community)
        continue

    females_ = df.loc[df['gender'] == 'Female', 'questions_total']
    males_ = df.loc[df['gender'] == 'Male', 'questions_total']
    plot_histogram(females_, males_, panels[idx], community)

# Re-run the layout now that titles and legends exist, then display.
fig.tight_layout()
plt.show()
In [ ]: