Initialization and data import are at the end of this notebook. They were placed at the bottom so that the analysis is easier to read, but they must be run first for the analysis to work as expected. Click here to go there.
In [5]:
# Summary statistics for every numeric column of the per-semester table.
merged.describe()
Out[5]:
In [6]:
# Two panels side by side: female contribution counts on the left,
# male contribution counts on the right, both plotted against merged's index.
fig, axes = pyplot.subplots(figsize=(15, 7), ncols=2, nrows=1)
female_ax, male_ax = axes
female_ax.plot(merged['contrib_females'])
male_ax.plot(merged['contrib_males'])
female_ax.set_title("Absolute number of female's contributions")
male_ax.set_title("Absolute number of male's contributions")
Out[6]:
In [7]:
# Male and female share of contributions drawn on a single set of axes.
fig, ax = pyplot.subplots(figsize=(15, 7))
# Draw through the Axes object created above instead of pyplot's implicit
# "current axes", so the cell keeps working if another figure becomes current.
ax.plot(merged[['male_prop', 'female_prop']])
ax.set_title("Proportion of Contributions")
# Axis labels so the figure can be read on its own.
ax.set_xlabel("Semester")
ax.set_ylabel("Proportion of contributions")
# Legend entries follow the column order plotted above (male first, then female);
# the legend is anchored just outside the right edge of the axes.
ax.legend(["Male","Female"],bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.)
Out[7]:
In [8]:
import statsmodels.formula.api as smf

# Number the rows 1..N so "semester" can be used as a numeric regressor.
# The ordinal is derived from the table length instead of a hard-coded
# eight-element list, which raised a length-mismatch error whenever the data
# covered a different number of semesters.
merged['semester'] = list(range(1, len(merged) + 1))

# Ordinary least squares: female share of contributions as a linear function
# of the semester ordinal.
result_female = smf.ols(formula="female_prop ~ semester", data=merged).fit()
result_female.summary()
Out[8]:
In [9]:
# Diagnostics for the female-proportion regression against "semester":
# the fit plot goes in the left panel, the CCPR plot in the right panel.
fig, axes = pyplot.subplots(figsize=(15, 7), ncols=2, nrows=1)
fit_ax, ccpr_ax = axes
fig = sm.graphics.plot_fit(result_female, "semester", ax=fit_ax)
fig = sm.graphics.plot_ccpr(result_female, "semester", ax=ccpr_ax)
In [13]:
# Ordinary least squares: male share of contributions as a linear function of
# the "semester" column of merged. Build the model first, then fit it.
male_model = smf.ols(formula="male_prop ~ semester", data=merged)
result_male = male_model.fit()
result_male.summary()
Out[13]:
In [14]:
# Diagnostics for the male-proportion regression against "semester":
# the fit plot goes in the left panel, the CCPR plot in the right panel.
fig, axes = pyplot.subplots(figsize=(15, 7), ncols=2, nrows=1)
fit_ax, ccpr_ax = axes
fig = sm.graphics.plot_fit(result_male, "semester", ax=fit_ax)
fig = sm.graphics.plot_ccpr(result_male, "semester", ax=ccpr_ax)
In [1]:
from __future__ import division
import pymongo, time, pylab, numpy, pandas, math
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot
import statsmodels.api as sm
%matplotlib inline
mpl.style.use('ggplot')
# pyplot.rcdefaults()
client = pymongo.MongoClient('localhost', 27017)
community = 'math'
stats_db = client[community].statistics
pipeline = [
{'$match':{'$or': [{'questions_total':{'$gt':0}}, {'answers_total':{'$gt':0}},
{'comments_total':{'$gt':0}}],
'gender': {'$ne': "Unknown"} }},
{'$unwind': '$dates'},
{'$project': {'gender':1, 'dates':1}}
]
cursor = stats_db.aggregate(pipeline, cursor={})
df = pandas.DataFrame(list(cursor))
In [2]:
# Index the rows by their date, then split them into one frame per gender.
indexed_df = df.set_index(['dates'])
is_male = indexed_df['gender'] == 'Male'
is_female = indexed_df['gender'] == 'Female'
males = indexed_df[is_male]
females = indexed_df[is_female]
In [3]:
from dateutil.relativedelta import *
import datetime
def aggregate_semesters(df):
    """Count contributions per six-month window, overall and by gender.

    Windows start at the calendar day of the earliest value in ``df['dates']``.
    Each window runs from its start day through the whole of the day six
    months later (inclusive); the next window starts on the following day.
    Windows are generated until a start day falls after the day of the latest
    value in ``df['dates']``.

    All counts are taken from ``df`` itself, so the result does not depend on
    any other variable in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per contribution, with a ``dates`` column of timestamps and a
        ``gender`` column. Rows whose gender is ``'Male'`` / ``'Female'`` feed
        the gender-specific counts; every row feeds the overall count.

    Returns
    -------
    pandas.DataFrame
        One row per window with the columns ``semester`` (window start as a
        ``datetime.date``), ``contrib_males``, ``contrib_females`` and
        ``contrib``. When ``df`` has no rows, an empty frame with the same
        columns is returned.
    """
    columns = ["semester", "contrib_males", "contrib_females", "contrib"]
    if len(df) == 0:
        # Nothing to aggregate; min()/max() would otherwise fail on empty data.
        return pandas.DataFrame(columns=columns)

    dates = pandas.to_datetime(df['dates'])
    is_male = df['gender'] == 'Male'
    is_female = df['gender'] == 'Female'

    # Work with midnight-aligned timestamps: windows are defined by whole days.
    begin = dates.min().normalize()
    last_day = dates.max().normalize()

    # Collect plain records and build the frame once at the end, instead of
    # growing a DataFrame row by row inside the loop.
    records = []
    while begin <= last_day:
        end = begin + pandas.DateOffset(months=6)
        # The end day is included in full, so the exclusive upper bound (and
        # the start of the next window) is the day after it.
        next_begin = end + pandas.DateOffset(days=1)
        in_window = (dates >= begin) & (dates < next_begin)
        records.append({
            "semester": begin.date(),
            "contrib_males": int((in_window & is_male).sum()),
            "contrib_females": int((in_window & is_female).sum()),
            "contrib": int(in_window.sum()),
        })
        begin = next_begin
    return pandas.DataFrame(records, columns=columns)
def male_proportion(row):
    """Return ``row['contrib_males']`` as a fraction of ``row['contrib']``."""
    male_count = row['contrib_males']
    total_count = row['contrib']
    return male_count / total_count
def female_proportion(row):
    """Return ``row['contrib_females']`` as a fraction of ``row['contrib']``."""
    female_count = row['contrib_females']
    total_count = row['contrib']
    return female_count / total_count
In [4]:
#aggregates data by semester
merged = aggregate_semesters(df)
#indexes dataframe by date
merged = merged.set_index(['semester'])
#calculating proportion of contributions by gender
merged['male_prop'] = merged.apply(male_proportion, axis=1)
merged['female_prop'] = merged.apply(female_proportion, axis=1)
In [ ]: