Initialization and data import are at the end of this notebook. They were placed at the bottom to make the analysis easier to read, but they must be run first for the analysis to work as expected. Click here to go there.
In [4]:
# Summary statistics of the female group. `females` is created by the setup
# cell (In [1]) at the end of the notebook, which must be run first.
females.describe()
Out[4]:
In [5]:
# Per-column median of the female group.
females.median()
Out[5]:
In [6]:
# Same summary statistics for the male group.
males.describe()
Out[6]:
In [7]:
# Per-column median of the male group.
males.median()
Out[7]:
In [8]:
pyplot.close('all')
# Pull the reputation columns out once; every histogram pair below reuses them.
female_reputation = females["reputation"]
male_reputation = males["reputation"]
# Full range first.
histogram(female_reputation, male_reputation, 100, "Reputation")
# Then zoom in by keeping only reputations at or below each cap.
for reputation_cap in (1000, 300):
    histogram(female_reputation[female_reputation <= reputation_cap],
              male_reputation[male_reputation <= reputation_cap],
              100, "Reputation")
pyplot.show()
In [9]:
# Rows of `females` whose reputation is strictly above 300. The same value is
# the cap of the narrowest reputation histogram drawn in cell In [8].
top_females = females[females["reputation"]> 300]
top_females.describe()
Out[9]:
In [10]:
# Per-column median of the high-reputation female subset.
top_females.median()
Out[10]:
In [11]:
# Same strict > 300 cut-off applied to `males`.
top_males = males[males["reputation"]> 300]
top_males.describe()
Out[11]:
In [12]:
# Per-column median of the high-reputation male subset.
top_males.median()
Out[12]:
In [13]:
# Rows of `females` whose reputation is 300 or less, i.e. the other side of
# the > 300 cut used for top_females.
common_females = females[females["reputation"] <= 300]
common_females.describe()
Out[13]:
In [14]:
# Per-column median of the low-reputation female subset.
common_females.median()
Out[14]:
In [15]:
# Same <= 300 cut-off applied to `males`.
common_males = males[males["reputation"] <= 300]
common_males.describe()
Out[15]:
In [16]:
# Per-column median of the low-reputation male subset.
common_males.median()
Out[16]:
In [17]:
# Questions, full female and male groups: one 'questions_total' column each.
females_questions = females['questions_total']
males_questions = males['questions_total']
In [18]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(females_questions, males_questions, "expon", 100, "Questions")
In [19]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_questions, males_questions)[1]
# equal_var=False selects Welch's unequal-variance t-test.
# NOTE(review): presumably chosen from the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_questions, males_questions, equal_var=False)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_questions, males_questions)[1]
In [20]:
# Questions, reputation > 300 subsets (top_females / top_males).
top_females_questions = top_females['questions_total']
top_males_questions = top_males['questions_total']
In [21]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(top_females_questions, top_males_questions, "expon", 50, "Questions")
In [22]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_questions, top_males_questions)[1]
# No equal_var argument: ttest_ind uses its default, the pooled-variance t-test.
# NOTE(review): some sibling cells pass equal_var=False; presumably the choice
# follows the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_questions, top_males_questions)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_questions, top_males_questions)[1]
In [23]:
# Questions, reputation <= 300 subsets (common_females / common_males).
common_females_questions = common_females['questions_total']
common_males_questions = common_males['questions_total']
In [24]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(common_females_questions, common_males_questions, "expon", 50, "Questions")
In [25]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_questions, common_males_questions)[1]
# equal_var=False selects Welch's unequal-variance t-test.
# NOTE(review): presumably chosen from the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_questions, common_males_questions, equal_var=False)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_questions, common_males_questions)[1]
In [26]:
# Answers, full female and male groups: one 'answers_total' column each.
females_answers = females['answers_total']
males_answers = males['answers_total']
In [27]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(females_answers, males_answers, "expon", 80, "Answers")
In [28]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_answers, males_answers)[1]
# No equal_var argument: ttest_ind uses its default, the pooled-variance t-test.
# NOTE(review): some sibling cells pass equal_var=False; presumably the choice
# follows the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_answers, males_answers)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_answers, males_answers)[1]
In [29]:
# Answers, reputation > 300 subsets (top_females / top_males).
top_females_answers = top_females['answers_total']
top_males_answers = top_males['answers_total']
In [30]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(top_females_answers, top_males_answers, "expon", 30, "Answers")
In [31]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_answers, top_males_answers)[1]
# No equal_var argument: ttest_ind uses its default, the pooled-variance t-test.
# NOTE(review): some sibling cells pass equal_var=False; presumably the choice
# follows the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_answers, top_males_answers)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_answers, top_males_answers)[1]
In [32]:
# Answers, reputation <= 300 subsets (common_females / common_males).
common_females_answers = common_females['answers_total']
common_males_answers = common_males['answers_total']
In [33]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(common_females_answers, common_males_answers, "expon", 30, "Answers")
In [34]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_answers, common_males_answers)[1]
# No equal_var argument: ttest_ind uses its default, the pooled-variance t-test.
# NOTE(review): some sibling cells pass equal_var=False; presumably the choice
# follows the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_answers, common_males_answers)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_answers, common_males_answers)[1]
In [35]:
# Comments, full female and male groups: one 'comments_total' column each.
females_comments = females['comments_total']
males_comments = males['comments_total']
In [36]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(females_comments, males_comments, "expon", 80, "Comments")
In [37]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_comments, males_comments)[1]
# equal_var=False selects Welch's unequal-variance t-test.
# NOTE(review): presumably chosen from the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_comments, males_comments, equal_var=False)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_comments, males_comments)[1]
In [38]:
# Comments, reputation > 300 subsets (top_females / top_males).
top_females_comments = top_females['comments_total']
top_males_comments = top_males['comments_total']
In [39]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(top_females_comments, top_males_comments, "expon", 50, "Comments")
In [40]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_comments, top_males_comments)[1]
# No equal_var argument: ttest_ind uses its default, the pooled-variance t-test.
# NOTE(review): some sibling cells pass equal_var=False; presumably the choice
# follows the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_comments, top_males_comments)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_comments, top_males_comments)[1]
In [41]:
# Comments, reputation <= 300 subsets (common_females / common_males).
common_females_comments = common_females['comments_total']
common_males_comments = common_males['comments_total']
In [42]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(common_females_comments, common_males_comments, "expon", 80, "Comments")
In [43]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_comments, common_males_comments)[1]
# equal_var=False selects Welch's unequal-variance t-test.
# NOTE(review): presumably chosen from the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_comments, common_males_comments, equal_var=False)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_comments, common_males_comments)[1]
In [44]:
# Contributions, full female and male groups: one 'contributions_total' column each.
females_contributions = females['contributions_total']
males_contributions = males['contributions_total']
In [45]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(females_contributions, males_contributions, "expon", 80, "Contributions")
In [46]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_contributions, males_contributions)[1]
# No equal_var argument: ttest_ind uses its default, the pooled-variance t-test.
# NOTE(review): some sibling cells pass equal_var=False; presumably the choice
# follows the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_contributions, males_contributions)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_contributions, males_contributions)[1]
In [47]:
# Contributions, reputation > 300 subsets (top_females / top_males).
top_females_contributions = top_females['contributions_total']
top_males_contributions = top_males['contributions_total']
In [48]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(top_females_contributions, top_males_contributions, "expon", 50, "Contributions")
In [49]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_contributions, top_males_contributions)[1]
# equal_var=False selects Welch's unequal-variance t-test.
# NOTE(review): presumably chosen from the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_contributions, top_males_contributions, equal_var=False)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_contributions, top_males_contributions)[1]
In [50]:
# Contributions, reputation <= 300 subsets (common_females / common_males).
common_females_contributions = common_females['contributions_total']
common_males_contributions = common_males['contributions_total']
In [51]:
# Draws histogram, PMF, QQ-plot against "expon" and boxplot, then prints the
# Levene p-value and the skewness of each sample (see show_data_shape).
show_data_shape(common_females_contributions, common_males_contributions, "expon", 80, "Contributions")
In [52]:
# Each line prints element [1] of the SciPy result, i.e. the test's p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_contributions, common_males_contributions)[1]
# No equal_var argument: ttest_ind uses its default, the pooled-variance t-test.
# NOTE(review): some sibling cells pass equal_var=False; presumably the choice
# follows the Levene p-value printed above - confirm.
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_contributions, common_males_contributions)[1]
# The doubling presumably converts a one-sided p-value into a two-sided one.
# NOTE(review): only valid if the installed SciPy's mannwhitneyu returns a
# one-sided p-value by default - confirm against the SciPy version in use.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_contributions, common_males_contributions)[1]
In [1]:
from __future__ import division
import pymongo, time, pylab, numpy, pandas, vincent
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot
%matplotlib inline
# Connect to the local MongoDB server and pick the `statistics` collection of
# the database named after the community being analysed.
client = pymongo.MongoClient('localhost', 27017)
community = 'math'
stats_db = client[community].statistics

# Match documents that have at least one question, answer or comment.
activity_filter = {'$or': [
    {'questions_total': {'$gt': 0}},
    {'answers_total': {'$gt': 0}},
    {'comments_total': {'$gt': 0}},
]}
# Fetch only the fields listed here; Mongo's own _id is excluded.
wanted_fields = {
    u'_id': False,
    u'questions_total': True,
    u'reputation': True,
    u'contributions_total': True,
    u'comments_total': True,
    u'answers_total': True,
    u'gender': True,
}
cursor = stats_db.find(activity_filter, wanted_fields)

# Materialise the cursor into a DataFrame, then split it by the gender label.
df = pandas.DataFrame(list(cursor))
gender_column = df['gender']
males = df[gender_column == 'Male']
females = df[gender_column == 'Female']
In [2]:
# Reset matplotlib's rc parameters to their defaults first, then apply the
# 'ggplot' style sheet on top, so the style starts from a known baseline.
pyplot.rcdefaults()
mpl.style.use('ggplot')
def histogram(sample1, sample2, bins, aspect):
    """Draw side-by-side histograms of two samples on one new figure.

    sample1 goes on the left panel and is titled as the female group;
    sample2 goes on the right panel and is titled as the male group.
    ``bins`` is passed straight to ``Axes.hist`` and ``aspect`` is the text
    that prefixes both titles. The figure is only created here; showing it
    is left to the caller.
    """
    _, panels = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    layout = (
        (panels[0], sample1, " by Females - Histogram"),
        (panels[1], sample2, " by Males - Histogram"),
    )
    for panel, values, title_suffix in layout:
        panel.hist(list(values), bins)
        panel.set_title(aspect + title_suffix)
def pmf_plot(sample1, sample2, aspect):
    """Draw side-by-side bar charts of relative value frequencies.

    For each sample, every distinct value is counted, values that occur only
    once are dropped, and the remaining counts are divided by the full sample
    length. The plotted frequencies therefore do not sum to 1 whenever the
    sample contains values that occur only once. sample1 is titled as the
    female group (left), sample2 as the male group (right); x-axis labels are
    hidden on both panels.
    """
    _, panels = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    layout = (
        (panels[0], sample1, " by Females - Density"),
        (panels[1], sample2, " by Males - Density"),
    )
    for panel, values, title_suffix in layout:
        counts = values.value_counts()
        # Keep only values seen more than once; normalise by the whole sample.
        repeated = counts[counts > 1]
        frequencies = repeated.sort_index() / len(values)
        frequencies.plot(kind="bar", ax=panel)
        panel.set_title(aspect + title_suffix)
        panel.get_xaxis().set_visible(False)
def boxplot(sample1, sample2, aspect):
    """Draw side-by-side boxplots of two samples on one new figure.

    sample1 goes on the left panel and is titled as the female group;
    sample2 goes on the right panel and is titled as the male group.
    ``aspect`` is the text that prefixes both titles.
    """
    _, panels = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    layout = (
        (panels[0], sample1, " by Females - Boxplot"),
        (panels[1], sample2, " by Males - Boxplot"),
    )
    for panel, values, title_suffix in layout:
        panel.boxplot(list(values))
        panel.set_title(aspect + title_suffix)
def qq_plot(sample1, sample2, distribution, aspect):
    """Draw side-by-side probability (QQ) plots of two samples.

    Each sample is compared against ``distribution`` with
    ``scipy.stats.probplot``. sample1 goes on the left panel and is titled as
    the female group; sample2 goes on the right panel and is titled as the
    male group.

    ``distribution`` is passed to probplot as ``dist`` and is also
    concatenated into both titles, so it must be a string such as "expon".
    ``aspect`` is the text that prefixes both titles.
    """
    fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))

    # Make the axes created above current before probplot draws through
    # pyplot. This guarantees the plot lands on the same axes whose title is
    # set below, instead of relying on pyplot.subplot(121) to hand back the
    # axes that subplots() already created.
    pyplot.sca(axes[0])
    stats.probplot(list(sample1), dist=distribution, plot=pyplot)
    # Set after probplot so it replaces the title probplot writes itself.
    axes[0].set_title(aspect + " by Females - QQPlot " + distribution)

    pyplot.sca(axes[1])
    stats.probplot(list(sample2), dist=distribution, plot=pyplot)
    # "Males" (plural) to match the titles of every other plot helper.
    axes[1].set_title(aspect + " by Males - QQPlot " + distribution)
In [3]:
def describe(sample1, sample2):
print sample1.describe()
print "Median: ", sample1.median()
print
print sample2.describe()
print "Median: ", sample2.median()
def show_data_shape(sample1, sample2, dist, bins, aspect):
pyplot.close('all')
#histogram
histogram(sample1, sample2, bins, aspect)
#PMF
pmf_plot(sample1, sample2, aspect)
#QQPlot
qq_plot(sample1, sample2, dist, aspect)
#boxplot
boxplot(sample1,sample2, aspect)
pyplot.show()
#Levene
print "Levene's test: ", stats.levene(sample1, sample2)[1]
#skewness
print "Skewness for Females: ", stats.skew(sample1)
print "Skewness for Males: ", stats.skew(sample2)
In [ ]: