Community: Mathematics

Initialization and data import are at the end of this notebook. They were placed at the bottom for better visualization of the analysis, but they must be run first so that the analysis works as expected. Click here to go there.

Data Summary

Women


In [4]:
females.describe()


Out[4]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 633.000000 633.000000 633.000000 161.000000 633.000000 633.000000 633.000000
mean 0.128670 2.690363 7.385466 -0.310565 3.486357 9.857820 387.303318
std 0.270712 21.932986 48.434478 0.753054 14.726966 17.905307 1958.974331
min 0.000000 0.000000 0.000000 -2.132227 -1.000000 0.000000 50.000000
25% 0.000000 0.000000 0.000000 -0.916840 0.750000 1.000000 83.000000
50% 0.000000 0.000000 0.000000 -0.453412 1.750000 4.000000 116.000000
75% 0.058824 1.000000 2.000000 0.100167 3.250000 11.000000 198.000000
max 1.000000 508.000000 1057.000000 2.873922 356.000000 178.000000 45388.000000

In [5]:
females.median()


Out[5]:
accepted_rate               0.000000
answers_accepted_total      0.000000
answers_total               0.000000
mean_utility               -0.453412
questions_avg               1.750000
questions_total             4.000000
reputation                116.000000
dtype: float64

Men


In [6]:
males.describe()


Out[6]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 7492.000000 7492.000000 7492.000000 2191.000000 7492.000000 7492.000000 7492.000000
mean 0.144217 8.853043 21.642285 -0.270100 2.778413 6.569541 868.969434
std 0.263682 91.971931 191.770553 0.780461 5.692874 18.395609 6045.880132
min 0.000000 0.000000 0.000000 -1.857143 -12.000000 0.000000 50.000000
25% 0.000000 0.000000 0.000000 -0.918454 0.000000 1.000000 101.000000
50% 0.000000 0.000000 1.000000 -0.464991 1.600000 2.000000 121.000000
75% 0.242058 1.000000 3.000000 0.180160 3.166667 5.000000 242.250000
max 1.000000 4828.000000 8667.000000 3.681959 170.000000 650.000000 221779.000000

In [7]:
males.median()


Out[7]:
accepted_rate               0.000000
answers_accepted_total      0.000000
answers_total               1.000000
mean_utility               -0.464991
questions_avg               1.600000
questions_total             2.000000
reputation                121.000000
dtype: float64

Top contributors


In [8]:
# Start from a clean slate, then compare the reputation histograms of women
# and men at three zoom levels: the full range, <= 1000 and <= 300.
pyplot.close('all')
histogram(females["reputation"], males["reputation"], 100, "Reputation")
for reputation_cap in (1000, 300):
    histogram(
        females.loc[females["reputation"] <= reputation_cap, "reputation"],
        males.loc[males["reputation"] <= reputation_cap, "reputation"],
        100,
        "Reputation",
    )
pyplot.show()


Top Women


In [9]:
# "Top" women: reputation strictly greater than 300.
top_females = females.loc[females["reputation"].gt(300)]
top_females.describe()


Out[9]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 123.000000 123.000000 123.000000 47.000000 123.000000 123.000000 123.000000
mean 0.280473 12.983740 34.398374 -0.140989 3.851004 25.650407 1524.268293
std 0.277420 48.559915 105.940266 0.831035 5.975449 33.073436 4272.204505
min 0.000000 0.000000 0.000000 -1.372813 0.000000 0.000000 302.000000
25% 0.000000 0.000000 1.000000 -0.775685 1.264245 4.000000 363.500000
50% 0.250000 2.000000 5.000000 -0.190794 2.638298 14.000000 504.000000
75% 0.483160 8.000000 27.500000 0.184591 4.200341 34.000000 1341.500000
max 1.000000 508.000000 1057.000000 2.873922 51.000000 178.000000 45388.000000

In [10]:
top_females.median()


Out[10]:
accepted_rate               0.250000
answers_accepted_total      2.000000
answers_total               5.000000
mean_utility               -0.190794
questions_avg               2.638298
questions_total            14.000000
reputation                504.000000
dtype: float64

Top Men


In [11]:
# "Top" men: reputation strictly greater than 300.
top_males = males.loc[males["reputation"].gt(300)]
top_males.describe()


Out[11]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 1683.000000 1683.000000 1683.000000 605.000000 1683.000000 1683.000000 1683.000000
mean 0.312377 38.579323 92.901961 -0.129997 4.761064 19.014854 3451.025550
std 0.224611 191.129956 396.505569 0.704042 9.673287 35.062562 12416.953799
min 0.000000 0.000000 0.000000 -1.837117 0.000000 0.000000 301.000000
25% 0.150000 1.000000 5.000000 -0.659126 1.500000 2.000000 421.500000
50% 0.323232 5.000000 15.000000 -0.228141 2.909091 8.000000 751.000000
75% 0.450000 17.000000 50.000000 0.275737 5.133333 22.000000 1965.000000
max 1.000000 4828.000000 8667.000000 2.468788 170.000000 650.000000 221779.000000

In [12]:
top_males.median()


Out[12]:
accepted_rate               0.323232
answers_accepted_total      5.000000
answers_total              15.000000
mean_utility               -0.228141
questions_avg               2.909091
questions_total             8.000000
reputation                751.000000
dtype: float64

Common women contributors


In [13]:
# "Common" women: reputation of at most 300 (the complement of the top group).
common_females = females.loc[females["reputation"].le(300)]
common_females.describe()


Out[13]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 510.000000 510.000000 510.000000 114.000000 510.000000 510.000000 510.000000
mean 0.092058 0.207843 0.870588 -0.380477 3.398413 6.049020 113.094118
std 0.256195 0.574454 1.963047 0.710585 16.146064 7.815879 50.373253
min 0.000000 0.000000 0.000000 -2.132227 -1.000000 0.000000 50.000000
25% 0.000000 0.000000 0.000000 -0.995145 0.643908 1.000000 74.000000
50% 0.000000 0.000000 0.000000 -0.500692 1.527778 3.000000 105.000000
75% 0.000000 0.000000 1.000000 0.000000 3.000000 8.000000 136.750000
max 1.000000 4.000000 16.000000 1.414214 356.000000 73.000000 298.000000

In [14]:
common_females.median()


Out[14]:
accepted_rate               0.000000
answers_accepted_total      0.000000
answers_total               0.000000
mean_utility               -0.500692
questions_avg               1.527778
questions_total             3.000000
reputation                105.000000
dtype: float64

Common men contributors


In [15]:
# "Common" men: reputation of at most 300 (the complement of the top group).
common_males = males.loc[males["reputation"].le(300)]
common_males.describe()


Out[15]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 5809.000000 5809.000000 5809.000000 1586.000000 5809.000000 5809.000000 5809.000000
mean 0.095497 0.240661 0.996729 -0.323545 2.203993 2.963849 120.888793
std 0.253963 0.657856 2.194680 0.801489 3.637619 4.747850 46.529120
min 0.000000 0.000000 0.000000 -1.857143 -12.000000 0.000000 50.000000
25% 0.000000 0.000000 0.000000 -1.000000 0.000000 1.000000 101.000000
50% 0.000000 0.000000 0.000000 -0.554213 1.166667 1.000000 111.000000
75% 0.000000 0.000000 1.000000 0.103371 2.750000 3.000000 136.000000
max 1.000000 7.000000 41.000000 3.681959 56.000000 77.000000 300.000000

In [16]:
common_males.median()


Out[16]:
accepted_rate               0.000000
answers_accepted_total      0.000000
answers_total               0.000000
mean_utility               -0.554213
questions_avg               1.166667
questions_total             1.000000
reputation                111.000000
dtype: float64

Second Question: Are the contributions made by both genders perceived by the community as having the same quality?

Hypothesis 1: Both genders have the same acceptance rate.

H0: acceptanceRate(Males) = acceptanceRate(Females);

H1: acceptanceRate(Males) != acceptanceRate(Females).

Data

It doesn't make sense to verify the quality of something that hasn't been done. So, we don't verify users who didn't post any answers.


In [17]:
# Accepted rate of every user who posted at least one answer, per gender.
females_acc_rate = females.loc[females['answers_total'] > 0, 'accepted_rate']
males_acc_rate = males.loc[males['answers_total'] > 0, 'accepted_rate']

The data's shape


In [18]:
show_data_shape(females_acc_rate, males_acc_rate, "norm", 30, "Accepted Rate")


Levene's test:  0.0745382705961
Skewness for Females:  0.977747918161
Skewness for Males:  0.989887017827

Hypothesis test


In [20]:
# Compare the accepted-rate samples of women and men with three two-sample
# tests; element [1] of each SciPy result (documented as the p-value) is printed.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
# NOTE(review): the factor 2 presumably turns a one-sided Mann-Whitney p-value into a
# two-sided one; whether mannwhitneyu is one-sided depends on the SciPy version - confirm.
print("%s %s" % ("Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_acc_rate, males_acc_rate)[1]))
print("%s %s" % ("Two-sample unpaired t-test: ", stats.ttest_ind(females_acc_rate, males_acc_rate)[1]))
print("%s %s" % ("Two-sample Mann Whitney U test: ", 2 * stats.mannwhitneyu(females_acc_rate, males_acc_rate)[1]))


Two-sample Kolmogorov-Smirnov test:  0.60203965179
Two-sample unpaired t-test:  0.500845759825
Two-sample Mann Whitney U test:  0.831373005814

Looking at the top contributors


In [21]:
# Accepted rate of the top contributors who posted at least one answer, per gender.
top_females_acc_rate = top_females.loc[top_females['answers_total'] > 0, 'accepted_rate']
top_males_acc_rate = top_males.loc[top_males['answers_total'] > 0, 'accepted_rate']

The data's shape


In [22]:
show_data_shape(top_females_acc_rate, top_males_acc_rate, "expon", 30, "Accepted Rate")


Levene's test:  0.00128821547976
Skewness for Females:  0.683407631269
Skewness for Males:  0.583352766716

Hypothesis test


In [35]:
# Compare the accepted-rate samples of top women and top men; element [1] of each
# SciPy result (documented as the p-value) is printed.
# The t-test is run with equal_var=False, i.e. without assuming equal variances.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
# NOTE(review): the factor 2 presumably turns a one-sided Mann-Whitney p-value into a
# two-sided one; whether mannwhitneyu is one-sided depends on the SciPy version - confirm.
print("%s %s" % ("Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_acc_rate, top_males_acc_rate)[1]))
print("%s %s" % ("Two-sample unpaired t-test: ", stats.ttest_ind(top_females_acc_rate, top_males_acc_rate, equal_var=False)[1]))
print("%s %s" % ("Two-sample Mann Whitney U test: ", 2 * stats.mannwhitneyu(top_females_acc_rate, top_males_acc_rate)[1]))


Two-sample Kolmogorov-Smirnov test:  0.265989745973
Two-sample unpaired t-test:  0.831296026267
Two-sample Mann Whitney U test:  0.747456900038

Looking at the common contributors


In [24]:
# Accepted rate of the common contributors who posted at least one answer, per gender.
common_females_acc_rate = common_females.loc[common_females['answers_total'] > 0, 'accepted_rate']
common_males_acc_rate = common_males.loc[common_males['answers_total'] > 0, 'accepted_rate']

The data's shape


In [25]:
show_data_shape(common_females_acc_rate, common_males_acc_rate, "expon", 30, "Accepted Rate")


Levene's test:  0.411875597209
Skewness for Females:  1.11503475464
Skewness for Males:  1.19370211812

Hypothesis test


In [26]:
# Compare the accepted-rate samples of common women and common men; element [1] of
# each SciPy result (documented as the p-value) is printed.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
# NOTE(review): the factor 2 presumably turns a one-sided Mann-Whitney p-value into a
# two-sided one; whether mannwhitneyu is one-sided depends on the SciPy version - confirm.
print("%s %s" % ("Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_acc_rate, common_males_acc_rate)[1]))
print("%s %s" % ("Two-sample unpaired t-test: ", stats.ttest_ind(common_females_acc_rate, common_males_acc_rate)[1]))
print("%s %s" % ("Two-sample Mann Whitney U test: ", 2 * stats.mannwhitneyu(common_females_acc_rate, common_males_acc_rate)[1]))


Two-sample Kolmogorov-Smirnov test:  0.939367889999
Two-sample unpaired t-test:  0.411875597209
Two-sample Mann Whitney U test:  0.413618034424

Hypothesis 2: The mean utility of the answer for each user is the same between genders.

H0: meanUtilityBy(Males) = meanUtilityBy(Females);

H1: meanUtilityBy(Males) != meanUtilityBy(Females).

Data

It doesn't make sense to verify the quality of something that hasn't been done. So, we don't verify users who didn't post any answers.


In [27]:
# Mean answer utility per user, keeping only users for whom a value exists (non-NaN).
females_mean_utility = females.dropna(subset=['mean_utility'])['mean_utility']
males_mean_utility = males.dropna(subset=['mean_utility'])['mean_utility']

The data's shape


In [28]:
show_data_shape(females_mean_utility, males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  0.552537303879
Skewness for Females:  0.849859950013
Skewness for Males:  0.825663202935

Hypothesis test


In [29]:
# Compare the mean-utility samples of women and men; element [1] of each SciPy
# result (documented as the p-value) is printed.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
# NOTE(review): the factor 2 presumably turns a one-sided Mann-Whitney p-value into a
# two-sided one; whether mannwhitneyu is one-sided depends on the SciPy version - confirm.
print("%s %s" % ("Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_mean_utility, males_mean_utility)[1]))
print("%s %s" % ("Two-sample unpaired t-test: ", stats.ttest_ind(females_mean_utility, males_mean_utility)[1]))
print("%s %s" % ("Two-sample Mann Whitney U test: ", 2 * stats.mannwhitneyu(females_mean_utility, males_mean_utility)[1]))


Two-sample Kolmogorov-Smirnov test:  0.790025277308
Two-sample unpaired t-test:  0.524550651542
Two-sample Mann Whitney U test:  0.687477529226

Looking at the top contributors


In [30]:
# Mean answer utility of the top contributors, keeping only non-NaN values.
top_females_mean_utility = top_females.dropna(subset=['mean_utility'])['mean_utility']
top_males_mean_utility = top_males.dropna(subset=['mean_utility'])['mean_utility']

The data's shape


In [31]:
show_data_shape(top_females_mean_utility, top_males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  0.344568452302
Skewness for Females:  1.19070826049
Skewness for Males:  0.682611124181

Hypothesis test


In [36]:
# Compare the mean-utility samples of top women and top men; element [1] of each
# SciPy result (documented as the p-value) is printed.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
# NOTE(review): the factor 2 presumably turns a one-sided Mann-Whitney p-value into a
# two-sided one; whether mannwhitneyu is one-sided depends on the SciPy version - confirm.
print("%s %s" % ("Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_mean_utility, top_males_mean_utility)[1]))
print("%s %s" % ("Two-sample unpaired t-test: ", stats.ttest_ind(top_females_mean_utility, top_males_mean_utility)[1]))
print("%s %s" % ("Two-sample Mann Whitney U test: ", 2 * stats.mannwhitneyu(top_females_mean_utility, top_males_mean_utility)[1]))


Two-sample Kolmogorov-Smirnov test:  0.845267092916
Two-sample unpaired t-test:  0.919022513862
Two-sample Mann Whitney U test:  0.652550355829

Looking at the common contributors


In [37]:
# Mean answer utility of the common contributors, keeping only non-NaN values.
common_females_mean_utility = common_females.dropna(subset=['mean_utility'])['mean_utility']
common_males_mean_utility = common_males.dropna(subset=['mean_utility'])['mean_utility']

The data's shape


In [38]:
show_data_shape(common_females_mean_utility, common_males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  0.327600750414
Skewness for Females:  0.55503753211
Skewness for Males:  0.921351935568

Hypothesis test


In [39]:
# Compare the mean-utility samples of common women and common men; element [1] of
# each SciPy result (documented as the p-value) is printed.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
# NOTE(review): the factor 2 presumably turns a one-sided Mann-Whitney p-value into a
# two-sided one; whether mannwhitneyu is one-sided depends on the SciPy version - confirm.
print("%s %s" % ("Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_mean_utility, common_males_mean_utility)[1]))
print("%s %s" % ("Two-sample unpaired t-test: ", stats.ttest_ind(common_females_mean_utility, common_males_mean_utility)[1]))
print("%s %s" % ("Two-sample Mann Whitney U test: ", 2 * stats.mannwhitneyu(common_females_mean_utility, common_males_mean_utility)[1]))


Two-sample Kolmogorov-Smirnov test:  0.837714053936
Two-sample unpaired t-test:  0.460718341709
Two-sample Mann Whitney U test:  0.851591120135

Hypothesis 3: The mean of the scores of the questions made by each user doesn't change between genders.

H0: questionScoreMean(Males) = questionScoreMean(Females);

H1: questionScoreMean(Males) != questionScoreMean(Females).

Data

It doesn't make sense to verify the quality of something that hasn't been done. So, we don't verify users who didn't post any questions.


In [40]:
# Average question score of every user who asked at least one question, per gender.
females_questions_mean = females.loc[females['questions_total'] > 0, 'questions_avg']
males_questions_mean = males.loc[males['questions_total'] > 0, 'questions_avg']

The shape of the data


In [41]:
show_data_shape(females_questions_mean, males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.148278324114
Skewness for Females:  20.5260349738
Skewness for Males:  11.6192250068

Hypothesis test


In [42]:
# Compare the average-question-score samples of women and men; element [1] of each
# SciPy result (documented as the p-value) is printed.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
# NOTE(review): the factor 2 presumably turns a one-sided Mann-Whitney p-value into a
# two-sided one; whether mannwhitneyu is one-sided depends on the SciPy version - confirm.
print("%s %s" % ("Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_questions_mean, males_questions_mean)[1]))
print("%s %s" % ("Two-sample unpaired t-test: ", stats.ttest_ind(females_questions_mean, males_questions_mean)[1]))
print("%s %s" % ("Two-sample Mann Whitney U test: ", 2 * stats.mannwhitneyu(females_questions_mean, males_questions_mean)[1]))


Two-sample Kolmogorov-Smirnov test:  0.0908173739591
Two-sample unpaired t-test:  0.159608306156
Two-sample Mann Whitney U test:  0.328418967307

Looking at the top contributors


In [43]:
# Average question score of the top contributors who asked at least one question.
top_females_questions_mean = top_females.loc[top_females['questions_total'] > 0, 'questions_avg']
top_males_questions_mean = top_males.loc[top_males['questions_total'] > 0, 'questions_avg']

The data's shape


In [44]:
show_data_shape(top_females_questions_mean, top_males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.384910868319
Skewness for Females:  4.88830181792
Skewness for Males:  9.30506186793

Hypothesis test


In [45]:
# Compare the average-question-score samples of top women and top men; element [1]
# of each SciPy result (documented as the p-value) is printed.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
# NOTE(review): the factor 2 presumably turns a one-sided Mann-Whitney p-value into a
# two-sided one; whether mannwhitneyu is one-sided depends on the SciPy version - confirm.
print("%s %s" % ("Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_questions_mean, top_males_questions_mean)[1]))
print("%s %s" % ("Two-sample unpaired t-test: ", stats.ttest_ind(top_females_questions_mean, top_males_questions_mean)[1]))
print("%s %s" % ("Two-sample Mann Whitney U test: ", 2 * stats.mannwhitneyu(top_females_questions_mean, top_males_questions_mean)[1]))


Two-sample Kolmogorov-Smirnov test:  0.0338573627755
Two-sample unpaired t-test:  0.221597960377
Two-sample Mann Whitney U test:  0.00586207254643

Looking at the common contributors


In [46]:
# Average question score of the common contributors who asked at least one question.
common_females_questions_mean = common_females.loc[common_females['questions_total'] > 0, 'questions_avg']
common_males_questions_mean = common_males.loc[common_males['questions_total'] > 0, 'questions_avg']

The data's shape


In [47]:
show_data_shape(common_females_questions_mean, common_males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.00213338022459
Skewness for Females:  19.2346597408
Skewness for Males:  4.4053185306

Hypothesis test


In [48]:
# Compare the average-question-score samples of common women and common men;
# element [1] of each SciPy result (documented as the p-value) is printed.
# The t-test is run with equal_var=False, i.e. without assuming equal variances.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
# NOTE(review): the factor 2 presumably turns a one-sided Mann-Whitney p-value into a
# two-sided one; whether mannwhitneyu is one-sided depends on the SciPy version - confirm.
print("%s %s" % ("Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_questions_mean, common_males_questions_mean)[1]))
print("%s %s" % ("Two-sample unpaired t-test: ", stats.ttest_ind(common_females_questions_mean, common_males_questions_mean, equal_var=False)[1]))
print("%s %s" % ("Two-sample Mann Whitney U test: ", 2 * stats.mannwhitneyu(common_females_questions_mean, common_males_questions_mean)[1]))


Two-sample Kolmogorov-Smirnov test:  0.127606871149
Two-sample unpaired t-test:  0.205704064264
Two-sample Mann Whitney U test:  0.342905385484

Hypothesis 4: Reputation is the same between genders.

H0: reputation(Males) = reputation(Females);

H1: reputation(Males) != reputation(Females).


In [49]:
# Reputation of every user, per gender (no filtering is applied here).
females_reputation, males_reputation = females['reputation'], males['reputation']

In [50]:
show_data_shape(females_reputation, males_reputation, "expon", 50, "Reputation")


Levene's test:  0.0520858308753
Skewness for Females:  19.6422145484
Skewness for Males:  21.8976253218

Hypothesis test


In [51]:
# Compare the reputation samples of women and men; element [1] of each SciPy
# result (documented as the p-value) is printed.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
# NOTE(review): the factor 2 presumably turns a one-sided Mann-Whitney p-value into a
# two-sided one; whether mannwhitneyu is one-sided depends on the SciPy version - confirm.
print("%s %s" % ("Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_reputation, males_reputation)[1]))
print("%s %s" % ("Two-sample unpaired t-test: ", stats.ttest_ind(females_reputation, males_reputation)[1]))
print("%s %s" % ("Two-sample Mann Whitney U test: ", 2 * stats.mannwhitneyu(females_reputation, males_reputation)[1]))


Two-sample Kolmogorov-Smirnov test:  7.17077135815e-15
Two-sample unpaired t-test:  0.0460230753578
Two-sample Mann Whitney U test:  8.59848057824e-07

Initialization

Here you can find the data importing and some useful functions used for analysing the data. Please, run this first, otherwise the analysis will not work.

Importing the data from the MongoDB database and inserting it into a pandas DataFrame for easy manipulation.


In [1]:
from __future__ import division
import pymongo, time, pylab, numpy, pandas
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot

%matplotlib inline

# Connect to the MongoDB server on localhost and pick the statistics
# collection of the community under study.
client = pymongo.MongoClient('localhost', 27017)

community = 'math'
stats_db = client[community].statistics

# Keep only users with at least one question, answer or comment.
active_user_filter = {'$or': [
    {'questions_total': {'$gt': 0}},
    {'answers_total': {'$gt': 0}},
    {'comments_total': {'$gt': 0}},
]}

# Fetch just the fields used by the analysis; Mongo's '_id' is left out.
field_projection = {u'_id': False}
for field_name in (u'accepted_rate', u'reputation', u'questions_avg',
                   u'answers_total', u'gender', u'questions_total',
                   u'answers_accepted_total', u'mean_utility'):
    field_projection[field_name] = True

cursor = stats_db.find(active_user_filter, field_projection)

# Materialise the cursor into a DataFrame, then split it by gender.
df = pandas.DataFrame(list(cursor))

gender_column = df['gender']
males = df[gender_column == 'Male']
females = df[gender_column == 'Female']

Utility functions for plotting.


In [2]:
# Reset matplotlib's rc parameters to their defaults first, so the style
# selected below is applied on top of a clean configuration.
pyplot.rcdefaults()
# Switch to the 'ggplot' style sheet for the figures drawn by this notebook.
mpl.style.use('ggplot')

def histogram(sample1, sample2, bins, aspect):
    """Draw side-by-side histograms of two samples on a new 1x2 figure.

    sample1 goes on the left panel (titled as the female group) and sample2
    on the right panel (titled as the male group). ``bins`` is forwarded to
    ``Axes.hist`` and ``aspect`` is the text that prefixes both titles.
    """
    _, (left_axis, right_axis) = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
    panels = (
        (left_axis, sample1, " by Females - Histogram"),
        (right_axis, sample2, " by Males - Histogram"),
    )
    for axis, sample, title_suffix in panels:
        axis.hist(list(sample), bins)
        axis.set_title(aspect + title_suffix)

def pdf_plot(sample1, sample2, aspect):
    """Draw side-by-side density plots of two samples on a new 1x2 figure.

    Each sample is plotted through its own ``plot(kind="density")`` method,
    sample1 on the left (female group) and sample2 on the right (male group).
    ``aspect`` is the text that prefixes both titles.
    """
    fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
    title_suffixes = (" by Females - Density", " by Males - Density")
    for position, sample in enumerate((sample1, sample2)):
        # Keep whatever axes object the plot call hands back, as the panel.
        axes[position] = sample.plot(ax=axes[position], kind="density")
        axes[position].set_title(aspect + title_suffixes[position])

    
    
def boxplot(sample1, sample2, aspect):
    """Draw side-by-side boxplots of two samples on a new 1x2 figure.

    sample1 goes on the left panel (female group), sample2 on the right
    panel (male group); ``aspect`` is the text that prefixes both titles.
    """
    _, panels = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
    samples = (sample1, sample2)
    title_suffixes = (" by Females - Boxplot", " by Males - Boxplot")
    for panel, sample, title_suffix in zip(panels, samples, title_suffixes):
        panel.boxplot(list(sample))
        panel.set_title(aspect + title_suffix)
    

def qq_plot(sample1, sample2, distribution, aspect):
    """Draw side-by-side probability (QQ) plots of two samples.

    Each sample is compared against the theoretical distribution named by
    ``distribution`` (a string accepted by ``scipy.stats.probplot``, e.g.
    "norm" or "expon"). sample1 goes on the left panel (female group) and
    sample2 on the right panel (male group); ``aspect`` is the text that
    prefixes both titles.
    """
    fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
    panels = (
        (axes[0], sample1, " by Females - QQPlot "),
        # The title now reads "Males", matching the other plotting helpers.
        (axes[1], sample2, " by Males - QQPlot "),
    )
    for axis, sample, title_infix in panels:
        # probplot is driven through the pyplot module, which draws on the
        # current axes, so make this panel current instead of re-creating
        # the subplot with pyplot.subplot(...).
        pyplot.sca(axis)
        stats.probplot(list(sample), dist=distribution, plot=pyplot)
        # Set our own title last so it replaces the one probplot writes.
        axis.set_title(aspect + title_infix + distribution)

Utility functions for describing the data.


In [3]:
def describe(sample1, sample2):
    """Print the summary statistics and the median of two samples.

    For each sample, prints the result of its ``describe()`` method followed
    by a "Median: " line with the result of its ``median()`` method. A blank
    line separates the two reports. Nothing is returned.

    Single-argument ``print(...)`` calls are used so the function produces
    the same output on Python 2 and also runs on Python 3.
    """
    for position, sample in enumerate((sample1, sample2)):
        if position:
            # Blank separator line between the two reports.
            print("")
        print(sample.describe())
        print("%s %s" % ("Median: ", sample.median()))
    
def show_data_shape(sample1, sample2, dist, bins, aspect):
    """Plot and summarise the shape of two samples (females first, males second).

    Closes any open figures, then draws histograms (with ``bins`` bins),
    density plots, QQ plots against the distribution named ``dist`` and
    boxplots, and shows them all. Afterwards prints element [1] of
    ``stats.levene`` (SciPy documents it as the p-value) and the skewness of
    each sample. ``aspect`` is the text used in every plot title.

    Single-argument ``print(...)`` calls are used so the function produces
    the same output on Python 2 and also runs on Python 3.
    """
    pyplot.close('all')

    # One figure per view of the data, all rendered by the single show() below.
    histogram(sample1, sample2, bins, aspect)
    pdf_plot(sample1, sample2, aspect)
    qq_plot(sample1, sample2, dist, aspect)
    boxplot(sample1, sample2, aspect)
    pyplot.show()

    # Levene's test for equality of variances between the two samples.
    print("%s %s" % ("Levene's test: ", stats.levene(sample1, sample2)[1]))

    # Skewness of each sample.
    print("%s %s" % ("Skewness for Females: ", stats.skew(sample1)))
    print("%s %s" % ("Skewness for Males: ", stats.skew(sample2)))

In [ ]: