Initialization and data import are at the end of this notebook. For better visualization of the analysis they were placed at the bottom, but it's necessary to run them first so that the analysis works as expected. Click here to go there.
In [4]:
# Summary statistics of the female users' frame.
females.describe()
Out[4]:
In [5]:
# Column-wise medians for female users. numeric_only=True restricts the
# computation to numeric columns: the frame also holds the string 'gender'
# column, which has no median and which recent pandas refuses to skip silently.
females.median(numeric_only=True)
Out[5]:
In [6]:
# Summary statistics of the male users' frame.
males.describe()
Out[6]:
In [7]:
# Column-wise medians for male users. numeric_only=True restricts the
# computation to numeric columns: the frame also holds the string 'gender'
# column, which has no median and which recent pandas refuses to skip silently.
males.median(numeric_only=True)
Out[7]:
In [8]:
# Close figures left over from earlier cells so only these three are rendered.
pyplot.close('all')
# Reputation histograms (females left, males right) at three zoom levels:
# every user, reputation <= 1000, and reputation <= 300. Each figure carries
# its own label so the three can be told apart when skimming the output.
histogram(females["reputation"], males["reputation"], 100, "Reputation")
histogram(females.loc[females["reputation"] <= 1000, "reputation"],
          males.loc[males["reputation"] <= 1000, "reputation"],
          100, "Reputation (<= 1000)")
histogram(females.loc[females["reputation"] <= 300, "reputation"],
          males.loc[males["reputation"] <= 300, "reputation"],
          100, "Reputation (<= 300)")
pyplot.show()
In [9]:
# Female users whose reputation exceeds 300, then their summary statistics.
top_females = females.loc[females["reputation"] > 300]
top_females.describe()
Out[9]:
In [10]:
# Column-wise medians for the high-reputation female users. numeric_only=True
# keeps the string 'gender' column out of the computation; recent pandas
# raises instead of skipping non-numeric columns silently.
top_females.median(numeric_only=True)
Out[10]:
In [11]:
# Male users whose reputation exceeds 300, then their summary statistics.
top_males = males.loc[males["reputation"] > 300]
top_males.describe()
Out[11]:
In [12]:
# Column-wise medians for the high-reputation male users. numeric_only=True
# keeps the string 'gender' column out of the computation; recent pandas
# raises instead of skipping non-numeric columns silently.
top_males.median(numeric_only=True)
Out[12]:
In [13]:
# Female users whose reputation is at most 300, then their summary statistics.
common_females = females.loc[females["reputation"] <= 300]
common_females.describe()
Out[13]:
In [14]:
# Column-wise medians for the low-reputation female users. numeric_only=True
# keeps the string 'gender' column out of the computation; recent pandas
# raises instead of skipping non-numeric columns silently.
common_females.median(numeric_only=True)
Out[14]:
In [15]:
# Male users whose reputation is at most 300, then their summary statistics.
common_males = males.loc[males["reputation"] <= 300]
common_males.describe()
Out[15]:
In [16]:
# Column-wise medians for the low-reputation male users. numeric_only=True
# keeps the string 'gender' column out of the computation; recent pandas
# raises instead of skipping non-numeric columns silently.
common_males.median(numeric_only=True)
Out[16]:
It doesn't make sense to verify the quality of something that hasn't been done. So, we don't verify users who didn't post any answers.
In [17]:
# Accepted-rate samples, limited to users who posted at least one answer.
females_acc_rate = females.loc[females['answers_total'] > 0, 'accepted_rate']
males_acc_rate = males.loc[males['answers_total'] > 0, 'accepted_rate']
In [18]:
# Histogram (30 bins), density, QQ plot against "norm" and boxplot of the
# accepted rate per gender, followed by Levene's p-value and the skewness.
show_data_shape(females_acc_rate, males_acc_rate, "norm", 30, "Accepted Rate")
In [20]:
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_acc_rate, males_acc_rate)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_acc_rate, males_acc_rate)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_acc_rate, males_acc_rate)[1]
In [21]:
# Accepted-rate samples of high-reputation users with at least one answer.
top_females_acc_rate = top_females.loc[top_females['answers_total'] > 0, 'accepted_rate']
top_males_acc_rate = top_males.loc[top_males['answers_total'] > 0, 'accepted_rate']
In [22]:
# Histogram (30 bins), density, QQ plot against "expon" and boxplot of the
# accepted rate for high-reputation users, plus Levene's p-value and skewness.
show_data_shape(top_females_acc_rate, top_males_acc_rate, "expon", 30, "Accepted Rate")
In [35]:
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_acc_rate, top_males_acc_rate)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_acc_rate, top_males_acc_rate, equal_var=False)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_acc_rate, top_males_acc_rate)[1]
In [24]:
# Accepted-rate samples of low-reputation users with at least one answer.
common_females_acc_rate = common_females.loc[common_females['answers_total'] > 0, 'accepted_rate']
common_males_acc_rate = common_males.loc[common_males['answers_total'] > 0, 'accepted_rate']
In [25]:
# Histogram (30 bins), density, QQ plot against "expon" and boxplot of the
# accepted rate for low-reputation users, plus Levene's p-value and skewness.
show_data_shape(common_females_acc_rate, common_males_acc_rate, "expon", 30, "Accepted Rate")
In [26]:
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_acc_rate, common_males_acc_rate)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_acc_rate, common_males_acc_rate)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_acc_rate, common_males_acc_rate)[1]
It doesn't make sense to verify the quality of something that hasn't been done. So, we don't verify users who didn't post any answers.
In [27]:
# Mean-utility samples; users without a 'mean_utility' value are left out.
females_mean_utility = females.dropna(subset=['mean_utility'])['mean_utility']
males_mean_utility = males.dropna(subset=['mean_utility'])['mean_utility']
In [28]:
# Histogram (30 bins), density, QQ plot against "expon" and boxplot of mean
# answer utility per gender, followed by Levene's p-value and the skewness.
show_data_shape(females_mean_utility, males_mean_utility, "expon", 30, "Mean Utility Answers")
In [29]:
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_mean_utility, males_mean_utility)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_mean_utility, males_mean_utility)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_mean_utility, males_mean_utility)[1]
In [30]:
# Mean-utility samples of high-reputation users; missing values are left out.
top_females_mean_utility = top_females.dropna(subset=['mean_utility'])['mean_utility']
top_males_mean_utility = top_males.dropna(subset=['mean_utility'])['mean_utility']
In [31]:
# Histogram (30 bins), density, QQ plot against "expon" and boxplot of mean
# answer utility for high-reputation users, plus Levene's p-value and skewness.
show_data_shape(top_females_mean_utility, top_males_mean_utility, "expon", 30, "Mean Utility Answers")
In [36]:
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_mean_utility, top_males_mean_utility)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_mean_utility, top_males_mean_utility)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_mean_utility, top_males_mean_utility)[1]
In [37]:
# Mean-utility samples of low-reputation users; missing values are left out.
common_females_mean_utility = common_females.dropna(subset=['mean_utility'])['mean_utility']
common_males_mean_utility = common_males.dropna(subset=['mean_utility'])['mean_utility']
In [38]:
# Histogram (30 bins), density, QQ plot against "expon" and boxplot of mean
# answer utility for low-reputation users, plus Levene's p-value and skewness.
show_data_shape(common_females_mean_utility, common_males_mean_utility, "expon", 30, "Mean Utility Answers")
In [39]:
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_mean_utility, common_males_mean_utility)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_mean_utility, common_males_mean_utility)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_mean_utility, common_males_mean_utility)[1]
It doesn't make sense to verify the quality of something that hasn't been done. So, we don't verify users who didn't post any questions.
In [40]:
# Average question score samples, limited to users who asked at least one question.
females_questions_mean = females.loc[females['questions_total'] > 0, 'questions_avg']
males_questions_mean = males.loc[males['questions_total'] > 0, 'questions_avg']
In [41]:
# Histogram (30 bins), density, QQ plot against "expon" and boxplot of the
# average question score per gender, plus Levene's p-value and the skewness.
show_data_shape(females_questions_mean, males_questions_mean, "expon", 30, "Mean Score Questions")
In [42]:
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_questions_mean, males_questions_mean)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_questions_mean, males_questions_mean)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_questions_mean, males_questions_mean)[1]
In [43]:
# Average question score samples of high-reputation users with at least one question.
top_females_questions_mean = top_females.loc[top_females['questions_total'] > 0, 'questions_avg']
top_males_questions_mean = top_males.loc[top_males['questions_total'] > 0, 'questions_avg']
In [44]:
# Histogram (30 bins), density, QQ plot against "expon" and boxplot of the
# average question score for high-reputation users, plus Levene's p-value and skewness.
show_data_shape(top_females_questions_mean, top_males_questions_mean, "expon", 30, "Mean Score Questions")
In [45]:
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_questions_mean, top_males_questions_mean)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_questions_mean, top_males_questions_mean)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_questions_mean, top_males_questions_mean)[1]
In [46]:
# Average question score samples of low-reputation users with at least one question.
common_females_questions_mean = common_females.loc[common_females['questions_total'] > 0, 'questions_avg']
common_males_questions_mean = common_males.loc[common_males['questions_total'] > 0, 'questions_avg']
In [47]:
# Histogram (30 bins), density, QQ plot against "expon" and boxplot of the
# average question score for low-reputation users, plus Levene's p-value and skewness.
show_data_shape(common_females_questions_mean, common_males_questions_mean, "expon", 30, "Mean Score Questions")
In [48]:
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_questions_mean, common_males_questions_mean)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_questions_mean, common_males_questions_mean, equal_var=False)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_questions_mean, common_males_questions_mean)[1]
In [49]:
# Reputation samples for every female and male user in the frames.
females_reputation = females.loc[:, 'reputation']
males_reputation = males.loc[:, 'reputation']
In [50]:
# Histogram (50 bins), density, QQ plot against "expon" and boxplot of
# reputation per gender, followed by Levene's p-value and the skewness.
show_data_shape(females_reputation, males_reputation, "expon", 50, "Reputation")
In [51]:
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_reputation, males_reputation)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_reputation, males_reputation)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_reputation, males_reputation)[1]
In [1]:
from __future__ import division
import pymongo, time, pylab, numpy, pandas
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot
%matplotlib inline
# Connect to a MongoDB server on this machine (port 27017).
client = pymongo.MongoClient('localhost', 27017)
# Database to analyse; the per-user records are read from its `statistics`
# collection. NOTE(review): presumably one database per community - confirm.
community = 'math'
stats_db = client[community].statistics
# Keep only documents with at least one question, answer or comment, and
# project just the fields used below (the Mongo `_id` is excluded).
cursor = stats_db.find({'$or': [{'questions_total':{'$gt':0}}, {'answers_total':{'$gt':0}}, {'comments_total':{'$gt':0}}] },
{u'_id': False, u'accepted_rate': True, u'reputation': True,
u'questions_avg': True, u'answers_total': True, u'gender':True,
u'questions_total': True, u'answers_accepted_total': True,
u'mean_utility': True,})
# Materialise the whole cursor into one DataFrame, then split it by the
# 'gender' field; rows with any other gender value end up in neither frame.
df = pandas.DataFrame(list(cursor))
males = df[df['gender']=='Male']
females = df[df['gender']=='Female']
In [2]:
# Reset matplotlib's rc parameters to their defaults, then apply the
# 'ggplot' style sheet to every figure drawn afterwards.
pyplot.rcdefaults()
mpl.style.use('ggplot')
def histogram(sample1, sample2, bins, aspect):
    """Draw side-by-side histograms of two samples in a new 15x7 figure.

    sample1 goes on the left panel and is titled as the female group;
    sample2 goes on the right panel and is titled as the male group.
    ``bins`` is forwarded to ``Axes.hist`` and ``aspect`` prefixes both titles.
    The figure is only created here; showing it is left to the caller.
    """
    _, (left, right) = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    panels = (
        (left, sample1, " by Females - Histogram"),
        (right, sample2, " by Males - Histogram"),
    )
    for panel, sample, suffix in panels:
        panel.hist(list(sample), bins)
        panel.set_title(aspect + suffix)
def pdf_plot(sample1, sample2, aspect):
    """Draw side-by-side density plots of two samples in a new 15x7 figure.

    Both samples must offer a pandas-style ``plot(ax=..., kind="density")``
    method. sample1 is drawn on the left (female group), sample2 on the
    right (male group); ``aspect`` prefixes both titles.
    """
    # Note: an earlier hand-rolled variant evaluated stats.gaussian_kde with a
    # covariance factor of .25 on a fixed 0-20 grid; the density estimate is
    # now delegated to the samples' own plot method.
    _, (left, right) = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    panels = (
        (left, sample1, " by Females - Density"),
        (right, sample2, " by Males - Density"),
    )
    for panel, sample, suffix in panels:
        drawn_on = sample.plot(ax=panel, kind="density")
        drawn_on.set_title(aspect + suffix)
def boxplot(sample1, sample2, aspect):
    """Draw side-by-side boxplots of two samples in a new 15x7 figure.

    sample1 goes on the left panel (female group), sample2 on the right
    panel (male group); ``aspect`` prefixes both titles.
    """
    _, (left, right) = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    panels = (
        (left, sample1, " by Females - Boxplot"),
        (right, sample2, " by Males - Boxplot"),
    )
    for panel, sample, suffix in panels:
        panel.boxplot(list(sample))
        panel.set_title(aspect + suffix)
def qq_plot(sample1, sample2, distribution, aspect):
    """Draw side-by-side probability (QQ) plots of two samples.

    A new 15x7 figure with two panels is created: sample1 on the left
    (female group) and sample2 on the right (male group). ``distribution``
    is the distribution name passed to ``stats.probplot`` as ``dist``; it is
    also appended to the titles, so it must be a string. ``aspect`` prefixes
    both titles.
    """
    fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    panels = ((axes[0], sample1, "Females"), (axes[1], sample2, "Males"))
    for ax, sample, group in panels:
        # probplot draws through the pyplot module, i.e. onto the current
        # axes, so make the intended panel current before calling it.
        pyplot.sca(ax)
        stats.probplot(list(sample), dist=distribution, plot=pyplot)
        # Set the title last so it is the one that remains on the panel.
        ax.set_title(aspect + " by " + group + " - QQPlot " + distribution)
In [3]:
def describe(sample1, sample2):
print sample1.describe()
print "Median: ", sample1.median()
print
print sample2.describe()
print "Median: ", sample2.median()
def show_data_shape(sample1, sample2, dist, bins, aspect):
pyplot.close('all')
#histogram
histogram(sample1, sample2, bins, aspect)
#PDF
pdf_plot(sample1, sample2, aspect)
#QQPlot
qq_plot(sample1, sample2, dist, aspect)
#boxplot
boxplot(sample1,sample2, aspect)
pyplot.show()
#Levene
print "Levene's test: ", stats.levene(sample1, sample2)[1]
#skewness
print "Skewness for Females: ", stats.skew(sample1)
print "Skewness for Males: ", stats.skew(sample2)
In [ ]: