Community: StackOverflow

Initialization and data import are at the end of this notebook. They were placed at the bottom so the analysis is easier to read, but they must be run first for the analysis to work as expected. Click here to go there.

Data Summary

Women


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column of the women's data.
females.describe()


Out[4]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 7897.000000 7897.000000 7897.000000 4161.000000 7897.000000 7897.000000 7897.000000
mean 0.270054 7.582246 22.341775 -0.021649 2.056335 13.543624 694.373939
std 0.300061 43.464103 102.089690 0.781284 5.913016 31.424080 3526.726330
min 0.000000 0.000000 0.000000 -1.827701 -4.000000 0.000000 50.000000
25% 0.000000 0.000000 1.000000 -0.607915 0.058824 1.000000 75.000000
50% 0.200000 1.000000 4.000000 -0.160692 0.909091 5.000000 132.000000
75% 0.428571 4.000000 13.000000 0.457107 2.000000 14.000000 394.000000
max 1.000000 1445.000000 3249.000000 3.697290 196.000000 728.000000 141184.000000

In [5]:
# Per-column medians of the women's data.
females.median()


Out[5]:
accepted_rate               0.200000
answers_accepted_total      1.000000
answers_total               4.000000
mean_utility               -0.160692
questions_avg               0.909091
questions_total             5.000000
reputation                132.000000
dtype: float64

Men


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column of the men's data.
males.describe()


Out[6]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 125340.000000 125340.000000 125340.000000 67917.000000 125340.000000 125340.000000 125340.000000
mean 0.281233 12.361401 34.937674 -0.002369 2.263167 11.434690 1169.622946
std 0.271936 94.287404 195.043927 0.754368 8.969468 27.684822 6567.785536
min 0.000000 0.000000 0.000000 -2.236068 -6.000000 0.000000 50.000000
25% 0.000000 0.000000 2.000000 -0.548726 0.000000 1.000000 86.000000
50% 0.250000 2.000000 6.000000 -0.130009 1.000000 4.000000 176.000000
75% 0.421053 6.000000 20.000000 0.404377 2.277778 12.000000 606.000000
max 1.000000 16730.000000 28137.000000 5.313012 1300.000000 1327.000000 640237.000000

In [7]:
# Per-column medians of the men's data.
males.median()


Out[7]:
accepted_rate               0.250000
answers_accepted_total      2.000000
answers_total               6.000000
mean_utility               -0.130009
questions_avg               1.000000
questions_total             4.000000
reputation                176.000000
dtype: float64

Top contributors


In [10]:
# Start from a clean slate so only the figures built in this cell are shown.
pyplot.close('all')

# Reputation over the whole range, 100 bins, women (left) vs. men (right).
histogram(females["reputation"], males["reputation"], 100, "Reputation")

# Same comparison restricted to users whose reputation is at most 1000.
histogram(
    females.loc[females["reputation"] <= 1000, "reputation"],
    males.loc[males["reputation"] <= 1000, "reputation"],
    100,
    "Reputation"
)

pyplot.show()


Top Women


In [11]:
# Women with a reputation strictly above 1000 are treated as top contributors.
is_top_woman = females["reputation"] > 1000
top_females = females.loc[is_top_woman]
del is_top_woman  # keep the notebook namespace unchanged
top_females.describe()


Out[11]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 868.000000 868.000000 868.000000 181.000000 868.000000 868.000000 868.000000
mean 0.342600 52.654378 144.726959 0.070394 3.903852 38.675115 4651.165899
std 0.157587 121.787467 277.750081 0.619072 10.459406 77.020379 9763.712921
min 0.000000 0.000000 0.000000 -1.000000 -2.500000 0.000000 1002.000000
25% 0.244309 10.000000 32.000000 -0.373423 0.944153 3.000000 1330.750000
50% 0.333333 22.000000 70.000000 0.031842 1.988506 12.500000 2027.500000
75% 0.437363 46.000000 133.250000 0.383406 3.671875 38.000000 3837.250000
max 1.000000 1445.000000 3249.000000 3.107033 173.437500 728.000000 141184.000000

In [12]:
# Per-column medians of the top women contributors (reputation > 1000).
top_females.median()


Out[12]:
accepted_rate                0.333333
answers_accepted_total      22.000000
answers_total               70.000000
mean_utility                 0.031842
questions_avg                1.988506
questions_total             12.500000
reputation                2027.500000
dtype: float64

Top Men


In [13]:
# Men with a reputation strictly above 1000 are treated as top contributors.
is_top_man = males["reputation"] > 1000
top_males = males.loc[is_top_man]
del is_top_man  # keep the notebook namespace unchanged
top_males.describe()


Out[13]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 21572.000000 21572.000000 21572.000000 3925.000000 21572.000000 21572.000000 21572.000000
mean 0.346288 60.458743 161.496755 0.157097 4.593401 29.144029 5655.511218
std 0.148891 220.917545 448.405582 0.613714 19.086542 56.172743 15036.391721
min 0.000000 0.000000 0.000000 -1.582612 -5.000000 0.000000 1001.000000
25% 0.250000 11.000000 38.000000 -0.198330 1.000000 3.000000 1425.000000
50% 0.333333 24.000000 73.000000 0.067474 2.246664 10.000000 2272.500000
75% 0.432692 50.000000 146.000000 0.362103 4.375000 31.000000 4612.250000
max 1.000000 16730.000000 28137.000000 5.313012 1300.000000 1327.000000 640237.000000

In [14]:
# Per-column medians of the top men contributors (reputation > 1000).
top_males.median()


Out[14]:
accepted_rate                0.333333
answers_accepted_total      24.000000
answers_total               73.000000
mean_utility                 0.067474
questions_avg                2.246664
questions_total             10.000000
reputation                2272.500000
dtype: float64

Common women contributors


In [15]:
# Women with a reputation of at most 1000 form the "common contributors" group.
is_common_woman = females["reputation"] <= 1000
common_females = females.loc[is_common_woman]
del is_common_woman  # keep the notebook namespace unchanged
common_females.describe()


Out[15]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 7029.000000 7029.000000 7029.000000 3980.000000 7029.000000 7029.000000 7029.000000
mean 0.261095 2.016361 7.228624 -0.025835 1.828188 10.440176 205.756011
std 0.312030 3.287563 10.690553 0.787675 5.031152 17.028885 203.109675
min 0.000000 0.000000 0.000000 -1.827701 -4.000000 0.000000 50.000000
25% 0.000000 0.000000 1.000000 -0.625221 0.000000 1.000000 71.000000
50% 0.166667 1.000000 3.000000 -0.170780 0.800000 5.000000 116.000000
75% 0.428571 3.000000 9.000000 0.463401 1.800000 13.000000 271.000000
max 1.000000 55.000000 109.000000 3.697290 196.000000 205.000000 999.000000

In [16]:
# Per-column medians of the common women contributors (reputation <= 1000).
common_females.median()


Out[16]:
accepted_rate               0.166667
answers_accepted_total      1.000000
answers_total               3.000000
mean_utility               -0.170780
questions_avg               0.800000
questions_total             5.000000
reputation                116.000000
dtype: float64

Common men contributors


In [17]:
# Men with a reputation of at most 1000 form the "common contributors" group.
is_common_man = males["reputation"] <= 1000
common_males = males.loc[is_common_man]
del is_common_man  # keep the notebook namespace unchanged
common_males.describe()


Out[17]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 103768.000000 103768.000000 103768.000000 63992.000000 103768.000000 103768.000000 103768.000000
mean 0.267709 2.362597 8.627708 -0.012149 1.778742 7.753151 237.065878
std 0.289226 3.452494 11.440576 0.761068 4.481431 13.823759 223.953725
min 0.000000 0.000000 0.000000 -2.236068 -6.000000 0.000000 50.000000
25% 0.000000 0.000000 1.000000 -0.573193 0.000000 0.000000 77.000000
50% 0.200000 1.000000 5.000000 -0.149071 0.789474 3.000000 133.000000
75% 0.411765 3.000000 11.000000 0.408880 2.000000 9.000000 340.000000
max 1.000000 48.000000 157.000000 5.053353 325.000000 519.000000 1000.000000

In [18]:
# Per-column medians of the common men contributors (reputation <= 1000).
common_males.median()


Out[18]:
accepted_rate               0.200000
answers_accepted_total      1.000000
answers_total               5.000000
mean_utility               -0.149071
questions_avg               0.789474
questions_total             3.000000
reputation                133.000000
dtype: float64

Second Question: Are the contributions made by both genders perceived by the community as having the same quality?

Hypothesis 1: Both genders have the same acceptance rate.

H0: acceptanceRate(Males) = acceptanceRate(Females);

H1: acceptanceRate(Males) != acceptanceRate(Females).

Data

It doesn't make sense to verify the quality of something that hasn't been done. So, we don't verify users who didn't post any answers.


In [19]:
# Accepted rate, restricted to users who posted at least one answer.
females_acc_rate = females.loc[females['answers_total'] > 0, 'accepted_rate']
males_acc_rate = males.loc[males['answers_total'] > 0, 'accepted_rate']

The data's shape


In [21]:
# Draw the 50-bin histograms, densities, QQ plots (normal reference) and boxplots
# of both samples, then print Levene's test p-value and each sample's skewness.
show_data_shape(females_acc_rate, males_acc_rate, "norm", 50, "Accepted Rate")


Levene's test:  5.08482305788e-36
Skewness for Females:  0.923995860446
Skewness for Males:  0.97890009617

Hypothesis test


In [22]:
# Each line prints only element [1] of the test result, i.e. the p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_acc_rate, males_acc_rate)[1]
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_acc_rate, males_acc_rate, equal_var=False)[1]
# NOTE(review): the factor 2 presumably converts a one-sided Mann-Whitney p-value
# into a two-sided one; confirm against the installed SciPy version, because a
# p-value that is already two-sided would be doubled incorrectly.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_acc_rate, males_acc_rate)[1]


Two-sample Kolmogorov-Smirnov test:  9.97826700665e-09
Two-sample unpaired t-test:  0.00258356422005
Two-sample Mann Whitney U test:  0.482122703116

Looking at the top contributors


In [23]:
# Accepted rate of top contributors who posted at least one answer.
top_females_acc_rate = top_females.loc[top_females['answers_total'] > 0, 'accepted_rate']
top_males_acc_rate = top_males.loc[top_males['answers_total'] > 0, 'accepted_rate']

The data's shape


In [24]:
# Draw the 30-bin histograms, densities, QQ plots (exponential reference) and
# boxplots of both samples, then print Levene's test p-value and the skewness.
show_data_shape(top_females_acc_rate, top_males_acc_rate, "expon", 30, "Accepted Rate")


Levene's test:  0.296995754406
Skewness for Females:  0.780519551776
Skewness for Males:  0.651412780679

Hypothesis test


In [25]:
# Each line prints only element [1] of the test result, i.e. the p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_acc_rate, top_males_acc_rate)[1]
# No equal_var argument here, so ttest_ind uses its default (equal variances assumed).
print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_acc_rate, top_males_acc_rate)[1]
# NOTE(review): the factor 2 presumably converts a one-sided Mann-Whitney p-value
# into a two-sided one; confirm against the installed SciPy version, because a
# p-value that is already two-sided would be doubled incorrectly.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_acc_rate, top_males_acc_rate)[1]


Two-sample Kolmogorov-Smirnov test:  0.937652401574
Two-sample unpaired t-test:  0.932905756436
Two-sample Mann Whitney U test:  0.811851432448

Looking at the common contributors


In [26]:
# Accepted rate of common contributors who posted at least one answer.
common_females_acc_rate = common_females.loc[common_females['answers_total'] > 0, 'accepted_rate']
common_males_acc_rate = common_males.loc[common_males['answers_total'] > 0, 'accepted_rate']

The data's shape


In [27]:
# Draw the 30-bin histograms, densities, QQ plots (exponential reference) and
# boxplots of both samples, then print Levene's test p-value and the skewness.
show_data_shape(common_females_acc_rate, common_males_acc_rate, "expon", 30, "Accepted Rate")


Levene's test:  6.83303385742e-23
Skewness for Females:  0.928582303983
Skewness for Males:  1.02388746299

Hypothesis test


In [28]:
# Each line prints only element [1] of the test result, i.e. the p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_acc_rate, common_males_acc_rate)[1]
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_acc_rate, common_males_acc_rate, equal_var=False)[1]
# NOTE(review): the factor 2 presumably converts a one-sided Mann-Whitney p-value
# into a two-sided one; confirm against the installed SciPy version, because a
# p-value that is already two-sided would be doubled incorrectly.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_acc_rate, common_males_acc_rate)[1]


Two-sample Kolmogorov-Smirnov test:  2.06314353468e-07
Two-sample unpaired t-test:  9.96882818633e-05
Two-sample Mann Whitney U test:  0.326055127477

Hypothesis 2: The mean utility of the answer for each user is the same between genders.

H0: meanUtilityBy(Males) = meanUtilityBy(Females);

H1: meanUtilityBy(Males) != meanUtilityBy(Females).

Data

It doesn't make sense to verify the quality of something that hasn't been done. So, we don't verify users who have no mean utility value (the missing entries are dropped).


In [29]:
# Mean utility per user, with missing values removed.
females_mean_utility = females['mean_utility'].dropna()
males_mean_utility = males['mean_utility'].dropna()

The data's shape


In [30]:
# Draw the 30-bin histograms, densities, QQ plots (exponential reference) and
# boxplots of both samples, then print Levene's test p-value and the skewness.
# NOTE(review): mean_utility takes negative values (see the summary tables), so an
# exponential reference may not be a meaningful QQ baseline here; confirm.
show_data_shape(females_mean_utility, males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  9.09599844459e-05
Skewness for Females:  0.762636587448
Skewness for Males:  0.835495912062

Hypothesis test


In [31]:
# Each line prints only element [1] of the test result, i.e. the p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_mean_utility, males_mean_utility)[1]
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_mean_utility, males_mean_utility, equal_var=False)[1]
# NOTE(review): the factor 2 presumably converts a one-sided Mann-Whitney p-value
# into a two-sided one; confirm against the installed SciPy version, because a
# p-value that is already two-sided would be doubled incorrectly.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_mean_utility, males_mean_utility)[1]


Two-sample Kolmogorov-Smirnov test:  0.000119450412226
Two-sample unpaired t-test:  0.121620730324
Two-sample Mann Whitney U test:  0.00814208803127

Looking at the top contributors


In [32]:
# Mean utility per top contributor, with missing values removed.
top_females_mean_utility = top_females['mean_utility'].dropna()
top_males_mean_utility = top_males['mean_utility'].dropna()

The data's shape


In [33]:
# Draw the 30-bin histograms, densities, QQ plots (exponential reference) and
# boxplots of both samples, then print Levene's test p-value and the skewness.
# NOTE(review): mean_utility takes negative values (see the summary tables), so an
# exponential reference may not be a meaningful QQ baseline here; confirm.
show_data_shape(top_females_mean_utility, top_males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  0.161764679201
Skewness for Females:  1.49352955153
Skewness for Males:  2.20029630132

Hypothesis test


In [34]:
# Each line prints only element [1] of the test result, i.e. the p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_mean_utility, top_males_mean_utility)[1]
# No equal_var argument here, so ttest_ind uses its default (equal variances assumed).
print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_mean_utility, top_males_mean_utility)[1]
# NOTE(review): the factor 2 presumably converts a one-sided Mann-Whitney p-value
# into a two-sided one; confirm against the installed SciPy version, because a
# p-value that is already two-sided would be doubled incorrectly.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_mean_utility, top_males_mean_utility)[1]


Two-sample Kolmogorov-Smirnov test:  0.000473784166564
Two-sample unpaired t-test:  0.0632994603903
Two-sample Mann Whitney U test:  0.0378178001369

Looking at the common contributors


In [35]:
# Mean utility per common contributor, with missing values removed.
common_females_mean_utility = common_females['mean_utility'].dropna()
common_males_mean_utility = common_males['mean_utility'].dropna()

The data's shape


In [36]:
# Draw the 30-bin histograms, densities, QQ plots (exponential reference) and
# boxplots of both samples, then print Levene's test p-value and the skewness.
# NOTE(review): mean_utility takes negative values (see the summary tables), so an
# exponential reference may not be a meaningful QQ baseline here; confirm.
show_data_shape(common_females_mean_utility, common_males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  0.000428144054351
Skewness for Females:  0.751442870121
Skewness for Males:  0.805717651822

Hypothesis test


In [37]:
# Each line prints only element [1] of the test result, i.e. the p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_mean_utility, common_males_mean_utility)[1]
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_mean_utility, common_males_mean_utility, equal_var=False)[1]
# NOTE(review): the factor 2 presumably converts a one-sided Mann-Whitney p-value
# into a two-sided one; confirm against the installed SciPy version, because a
# p-value that is already two-sided would be doubled incorrectly.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_mean_utility, common_males_mean_utility)[1]


Two-sample Kolmogorov-Smirnov test:  0.00148400371667
Two-sample unpaired t-test:  0.286649052496
Two-sample Mann Whitney U test:  0.043008849874

Hypothesis 3: The mean of the scores of the questions made by each user doesn't change between genders.

H0: questionScoreMean(Males) = questionScoreMean(Females);

H1: questionScoreMean(Males) != questionScoreMean(Females).

Data

It doesn't make sense to verify the quality of something that hasn't been done. So, we don't verify users who didn't post any questions.


In [38]:
# Average question score, restricted to users who asked at least one question.
females_questions_mean = females.loc[females['questions_total'] > 0, 'questions_avg']
males_questions_mean = males.loc[males['questions_total'] > 0, 'questions_avg']

The shape of the data


In [39]:
# Draw the 30-bin histograms, densities, QQ plots (exponential reference) and
# boxplots of both samples, then print Levene's test p-value and the skewness.
# NOTE(review): questions_avg takes negative values (see the summary tables), so an
# exponential reference may not be a meaningful QQ baseline here; confirm.
show_data_shape(females_questions_mean, males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.0103241068597
Skewness for Females:  14.2790070694
Skewness for Males:  54.6242657409

Hypothesis test


In [40]:
# Each line prints only element [1] of the test result, i.e. the p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_questions_mean, males_questions_mean)[1]
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_questions_mean, males_questions_mean, equal_var=False)[1]
# NOTE(review): the factor 2 presumably converts a one-sided Mann-Whitney p-value
# into a two-sided one; confirm against the installed SciPy version, because a
# p-value that is already two-sided would be doubled incorrectly.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_questions_mean, males_questions_mean)[1]


Two-sample Kolmogorov-Smirnov test:  3.44842321018e-36
Two-sample unpaired t-test:  4.22941300745e-07
Two-sample Mann Whitney U test:  1.5131558872e-36

Looking at the top contributors


In [41]:
# Average question score of top contributors who asked at least one question.
top_females_questions_mean = top_females.loc[top_females['questions_total'] > 0, 'questions_avg']
top_males_questions_mean = top_males.loc[top_males['questions_total'] > 0, 'questions_avg']

The data's shape


In [42]:
# Draw the 30-bin histograms, densities, QQ plots (exponential reference) and
# boxplots of both samples, then print Levene's test p-value and the skewness.
# NOTE(review): questions_avg takes negative values (see the summary tables), so an
# exponential reference may not be a meaningful QQ baseline here; confirm.
show_data_shape(top_females_questions_mean, top_males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.344968587838
Skewness for Females:  10.3789676902
Skewness for Males:  33.6598705553

Hypothesis test


In [43]:
# Each line prints only element [1] of the test result, i.e. the p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_questions_mean, top_males_questions_mean)[1]
# No equal_var argument here, so ttest_ind uses its default (equal variances assumed).
print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_questions_mean, top_males_questions_mean)[1]
# NOTE(review): the factor 2 presumably converts a one-sided Mann-Whitney p-value
# into a two-sided one; confirm against the installed SciPy version, because a
# p-value that is already two-sided would be doubled incorrectly.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_questions_mean, top_males_questions_mean)[1]


Two-sample Kolmogorov-Smirnov test:  1.37202186703e-07
Two-sample unpaired t-test:  0.208913893049
Two-sample Mann Whitney U test:  1.56635411444e-07

Looking at the common contributors


In [44]:
# Average question score of common contributors who asked at least one question.
common_females_questions_mean = common_females.loc[common_females['questions_total'] > 0, 'questions_avg']
common_males_questions_mean = common_males.loc[common_males['questions_total'] > 0, 'questions_avg']

The data's shape


In [45]:
# Draw the 30-bin histograms, densities, QQ plots (exponential reference) and
# boxplots of both samples, then print Levene's test p-value and the skewness.
# NOTE(review): questions_avg takes negative values (see the summary tables), so an
# exponential reference may not be a meaningful QQ baseline here; confirm.
show_data_shape(common_females_questions_mean, common_males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.28003070781
Skewness for Females:  14.5549635989
Skewness for Males:  13.2888573576

Hypothesis test


In [46]:
# Each line prints only element [1] of the test result, i.e. the p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_questions_mean, common_males_questions_mean)[1]
# No equal_var argument here, so ttest_ind uses its default (equal variances assumed).
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_questions_mean, common_males_questions_mean)[1]
# NOTE(review): the factor 2 presumably converts a one-sided Mann-Whitney p-value
# into a two-sided one; confirm against the installed SciPy version, because a
# p-value that is already two-sided would be doubled incorrectly.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_questions_mean, common_males_questions_mean)[1]


Two-sample Kolmogorov-Smirnov test:  3.68236021391e-17
Two-sample unpaired t-test:  0.0604102276128
Two-sample Mann Whitney U test:  2.43897237729e-14

Hypothesis 4: Reputation is the same between genders.

H0: reputation(Males) = reputation(Females);

H1: reputation(Males) != reputation(Females).


In [47]:
# Reputation column of each gender; no rows are filtered out here.
females_reputation = females['reputation']
males_reputation = males['reputation']

In [48]:
# Draw the 50-bin histograms, densities, QQ plots (exponential reference) and
# boxplots of both samples, then print Levene's test p-value and the skewness.
show_data_shape(females_reputation, males_reputation, "expon", 50, "Reputation")


Levene's test:  5.7427856712e-10
Skewness for Females:  20.5032200007
Skewness for Males:  31.7655218713

Hypothesis test


In [49]:
# Each line prints only element [1] of the test result, i.e. the p-value.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_reputation, males_reputation)[1]
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_reputation, males_reputation, equal_var=False)[1]
# NOTE(review): the factor 2 presumably converts a one-sided Mann-Whitney p-value
# into a two-sided one; confirm against the installed SciPy version, because a
# p-value that is already two-sided would be doubled incorrectly.
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_reputation, males_reputation)[1]


Two-sample Kolmogorov-Smirnov test:  3.78635601452e-61
Two-sample unpaired t-test:  2.73694656006e-27
Two-sample Mann Whitney U test:  8.98048646755e-81

Initialization

Here you can find the data importing and some useful functions used for analysing the data. Please, run this first, otherwise the analysis will not work.

Importing the data from the MongoDB database and loading it into a pandas DataFrame for easy manipulation.


In [1]:
from __future__ import division
import pymongo, time, pylab, numpy, pandas
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot

%matplotlib inline

# Connect to the MongoDB server on localhost, port 27017.
client = pymongo.MongoClient('localhost', 27017)

# Per-user statistics are read from the `statistics` collection of the
# database named after the community.
community = 'stackoverflow'
stats_db = client[community].statistics

# Keep users with at least one question, answer or comment, and fetch only the
# fields listed in the projection (the Mongo `_id` is excluded).
cursor = stats_db.find(
    {'$or': [
        {'questions_total': {'$gt': 0}},
        {'answers_total': {'$gt': 0}},
        {'comments_total': {'$gt': 0}},
    ]},
    {
        u'_id': False,
        u'accepted_rate': True,
        u'reputation': True,
        u'questions_avg': True,
        u'answers_total': True,
        u'gender': True,
        u'questions_total': True,
        u'answers_accepted_total': True,
        u'mean_utility': True,
    }
)

# Materialise the whole cursor into a DataFrame.
df = pandas.DataFrame(list(cursor))

# Split the users by their recorded gender label.
males = df.loc[df['gender'] == 'Male']
females = df.loc[df['gender'] == 'Female']

Utility functions for plotting.


In [2]:
# Reset matplotlib's rc parameters to their defaults, then apply the 'ggplot' style
# so every figure in the notebook shares the same look.
pyplot.rcdefaults()
mpl.style.use('ggplot')

def histogram(sample1, sample2, bins, aspect):
    """Draw two histograms side by side in a new 15x7 figure.

    sample1 is drawn on the left panel (titled "... by Females") and sample2 on
    the right panel (titled "... by Males").

    Parameters:
        sample1, sample2: iterables of values; each is converted to a list
            before plotting.
        bins: passed to ``Axes.hist`` as the bins argument.
        aspect: text placed at the start of both panel titles.
    """
    fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
    panels = (
        (sample1, " by Females - Histogram"),
        (sample2, " by Males - Histogram"),
    )
    for ax, (values, title_suffix) in zip(axes, panels):
        ax.hist(list(values), bins)
        ax.set_title(aspect + title_suffix)

def pdf_plot(sample1, sample2, aspect):
    """Draw density plots of two samples side by side in a new 15x7 figure.

    sample1 goes on the left panel (titled "... by Females") and sample2 on the
    right panel (titled "... by Males").

    Parameters:
        sample1, sample2: objects exposing ``.plot(ax=..., kind="density")``
            (pandas Series in this notebook).
        aspect: text placed at the start of both panel titles.
    """
    fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
    panels = (
        (0, sample1, " by Females - Density"),
        (1, sample2, " by Males - Density"),
    )
    for position, sample, title_suffix in panels:
        # Keep whatever axes object the plot call hands back in the same slot.
        axes[position] = sample.plot(ax=axes[position], kind="density")
        axes[position].set_title(aspect + title_suffix)

    
    
def boxplot(sample1, sample2, aspect):
    """Draw two boxplots side by side in a new 15x7 figure.

    sample1 is drawn on the left panel (titled "... by Females") and sample2 on
    the right panel (titled "... by Males").

    Parameters:
        sample1, sample2: iterables of values; each is converted to a list
            before plotting.
        aspect: text placed at the start of both panel titles.
    """
    fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
    panels = (
        (sample1, " by Females - Boxplot"),
        (sample2, " by Males - Boxplot"),
    )
    for ax, (values, title_suffix) in zip(axes, panels):
        ax.boxplot(list(values))
        ax.set_title(aspect + title_suffix)
    

def qq_plot(sample1, sample2, distribution, aspect):
    """Draw probability (QQ) plots of two samples side by side in a new 15x7 figure.

    sample1 goes on the left panel (titled "... by Females") and sample2 on the
    right panel (titled "... by Males").

    Parameters:
        sample1, sample2: iterables of values; each is converted to a list
            before being handed to ``stats.probplot``.
        distribution: name of the reference distribution, passed to
            ``stats.probplot`` as ``dist`` and appended to both titles.
        aspect: text placed at the start of both panel titles.
    """
    fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
    panels = (
        (axes[0], sample1, " by Females - QQPlot "),
        (axes[1], sample2, " by Males - QQPlot "),
    )
    for ax, sample, title_suffix in panels:
        # Make the panel created above the current axes instead of calling
        # pyplot.subplot(121/122): depending on the matplotlib version, that
        # call can create new axes that replace the ones held in `axes`, so
        # the title set below would land on axes no longer shown.
        pyplot.sca(ax)
        stats.probplot(list(sample), dist=distribution, plot=pyplot)
        # Set after probplot so this title overrides any title it applies.
        ax.set_title(aspect + title_suffix + distribution)

Utility functions for describing the data.


In [3]:
def describe(sample1, sample2):
    print sample1.describe()
    print "Median: ", sample1.median()
    print 
    print sample2.describe()
    print "Median: ", sample2.median()
    
def show_data_shape(sample1, sample2, dist, bins, aspect):
    pyplot.close('all')
    #histogram
    histogram(sample1, sample2, bins, aspect)

    #PDF
    pdf_plot(sample1, sample2, aspect)

    #QQPlot
    qq_plot(sample1, sample2, dist, aspect)

    #boxplot
    boxplot(sample1,sample2, aspect)
    pyplot.show()

    #Levene
    print "Levene's test: ", stats.levene(sample1, sample2)[1]
    
    #skewness
    print "Skewness for Females: ", stats.skew(sample1)
    print "Skewness for Males: ", stats.skew(sample2)

In [ ]: