Community: Programmers

Initialization and data import are at the end of this notebook. They were placed at the bottom to keep the analysis easy to read, but they must be run first for the analysis to work as expected. Click here to go there.

Data Summary

Women


In [5]:
females.describe()


Out[5]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 383.000000 383.000000 383.000000 182.000000 383.000000 383.000000 383.000000
mean 0.066655 0.819843 5.007833 -0.136622 4.998534 1.402089 472.519582
std 0.199117 4.582738 19.049466 0.786598 12.313895 3.761919 1833.622104
min 0.000000 0.000000 0.000000 -1.333772 -3.000000 0.000000 51.000000
25% 0.000000 0.000000 0.000000 -0.621652 0.000000 0.000000 101.000000
50% 0.000000 0.000000 1.000000 -0.330430 0.000000 1.000000 127.000000
75% 0.000000 0.000000 2.000000 0.166927 5.000000 1.000000 213.500000
max 1.000000 62.000000 173.000000 4.401387 107.000000 42.000000 24391.000000

In [6]:
females.median()


Out[6]:
accepted_rate               0.00000
answers_accepted_total      0.00000
answers_total               1.00000
mean_utility               -0.33043
questions_avg               0.00000
questions_total             1.00000
reputation                127.00000
dtype: float64

Men


In [7]:
males.describe()


Out[7]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 8731.000000 8731.000000 8731.000000 4973.000000 8731.000000 8731.000000 8731.000000
mean 0.081216 0.852022 5.113847 -0.055142 3.697275 0.931394 421.304776
std 0.219704 6.046695 25.712635 0.809089 16.506019 2.419752 1759.993363
min 0.000000 0.000000 0.000000 -1.808970 -5.000000 0.000000 50.000000
25% 0.000000 0.000000 0.000000 -0.573311 0.000000 0.000000 103.000000
50% 0.000000 0.000000 1.000000 -0.274029 0.000000 0.000000 133.000000
75% 0.000000 0.000000 2.000000 0.281594 3.708333 1.000000 239.000000
max 1.000000 231.000000 796.000000 9.653682 1100.000000 76.000000 62314.000000

In [8]:
males.median()


Out[8]:
accepted_rate               0.000000
answers_accepted_total      0.000000
answers_total               1.000000
mean_utility               -0.274029
questions_avg               0.000000
questions_total             0.000000
reputation                133.000000
dtype: float64

Top contributors


In [9]:
# Close any figures that are still open before drawing the new ones.
pyplot.close('all')
# Reputation histograms (100 bins) at three zoom levels: all users, users with
# reputation <= 1000, and users with reputation <= 450.
histogram(females["reputation"], males["reputation"], 100, "Reputation")
histogram(females[females["reputation"]<= 1000]["reputation"], males[males["reputation"]<= 1000]["reputation"], 100, "Reputation")
histogram(females[females["reputation"]<= 450]["reputation"], males[males["reputation"]<= 450]["reputation"], 100, "Reputation")
pyplot.show()


Top Women


In [10]:
# "Top" women: reputation strictly greater than 450.
# NOTE(review): 450 is a hard-coded cut-off; its rationale is not stated here.
top_females = females[females["reputation"]> 450]
top_females.describe()


Out[10]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 46.000000 46.000000 46.000000 30.000000 46.000000 46.000000 46.000000
mean 0.137440 6.021739 33.934783 0.187713 12.613522 5.717391 2842.021739
std 0.126228 12.082106 45.645690 0.912327 20.468899 9.427296 4687.361966
min 0.000000 0.000000 0.000000 -0.694013 0.000000 0.000000 454.000000
25% 0.024621 1.000000 6.500000 -0.327317 0.000000 0.000000 685.000000
50% 0.100847 2.000000 16.000000 0.037226 7.785714 1.500000 1223.000000
75% 0.218690 3.750000 32.250000 0.339956 14.612069 7.000000 2505.250000
max 0.500000 62.000000 173.000000 4.401387 107.000000 42.000000 24391.000000

In [11]:
top_females.median()


Out[11]:
accepted_rate                0.100847
answers_accepted_total       2.000000
answers_total               16.000000
mean_utility                 0.037226
questions_avg                7.785714
questions_total              1.500000
reputation                1223.000000
dtype: float64

Top Men


In [12]:
# "Top" men: reputation strictly greater than 450.
# NOTE(review): 450 is a hard-coded cut-off; its rationale is not stated here.
top_males = males[males["reputation"]> 450]
top_males.describe()


Out[12]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 1113.000000 1113.000000 1113.000000 816.000000 1113.000000 1113.000000 1113.000000
mean 0.177299 5.757412 31.483378 0.325455 9.892585 2.809524 2234.580413
std 0.200537 16.070229 66.073985 0.893799 38.515613 5.647596 4528.080814
min 0.000000 0.000000 0.000000 -1.386750 -2.000000 0.000000 451.000000
25% 0.000000 0.000000 5.000000 -0.176646 0.000000 0.000000 606.000000
50% 0.133333 2.000000 12.000000 0.140681 3.000000 1.000000 910.000000
75% 0.250000 5.000000 28.000000 0.564684 10.200000 3.000000 1815.000000
max 1.000000 231.000000 796.000000 9.653682 1100.000000 76.000000 62314.000000

In [13]:
top_males.median()


Out[13]:
accepted_rate               0.133333
answers_accepted_total      2.000000
answers_total              12.000000
mean_utility                0.140681
questions_avg               3.000000
questions_total             1.000000
reputation                910.000000
dtype: float64

Common women contributors


In [14]:
# "Common" women: reputation of 450 or less (the complement of reputation > 450).
common_females = females[females["reputation"] <= 450]
common_females.describe()


Out[14]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 337.000000 337.000000 337.000000 152.000000 337.00000 337.000000 337.000000
mean 0.056994 0.109792 1.059347 -0.200636 3.95910 0.813056 149.086053
std 0.205333 0.340420 1.823142 0.746181 10.35644 1.135497 79.347610
min 0.000000 0.000000 0.000000 -1.333772 -3.00000 0.000000 51.000000
25% 0.000000 0.000000 0.000000 -0.663396 0.00000 0.000000 101.000000
50% 0.000000 0.000000 0.000000 -0.397628 0.00000 1.000000 121.000000
75% 0.000000 0.000000 1.000000 0.082377 4.00000 1.000000 159.000000
max 1.000000 2.000000 15.000000 4.063193 101.00000 6.000000 450.000000

In [15]:
common_females.median()


Out[15]:
accepted_rate               0.000000
answers_accepted_total      0.000000
answers_total               0.000000
mean_utility               -0.397628
questions_avg               0.000000
questions_total             1.000000
reputation                121.000000
dtype: float64

Common men contributors


In [16]:
# "Common" men: reputation of 450 or less (the complement of reputation > 450).
common_males = males[males["reputation"] <= 450]
common_males.describe()


Out[16]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 7618.000000 7618.000000 7618.000000 4157.000000 7618.000000 7618.000000 7618.000000
mean 0.067178 0.135337 1.261223 -0.129852 2.792132 0.656997 156.382778
std 0.218875 0.416205 1.984983 0.769721 9.447919 1.209820 79.189708
min 0.000000 0.000000 0.000000 -1.808970 -5.000000 0.000000 50.000000
25% 0.000000 0.000000 0.000000 -0.615233 0.000000 0.000000 101.000000
50% 0.000000 0.000000 1.000000 -0.353553 0.000000 0.000000 123.000000
75% 0.000000 0.000000 1.000000 0.165700 3.000000 1.000000 178.000000
max 1.000000 5.000000 23.000000 7.413026 331.000000 26.000000 449.000000

In [17]:
common_males.median()


Out[17]:
accepted_rate               0.000000
answers_accepted_total      0.000000
answers_total               1.000000
mean_utility               -0.353553
questions_avg               0.000000
questions_total             0.000000
reputation                123.000000
dtype: float64

Second Question: Are the contributions made by both genders perceived by the community as having the same quality?

Hypothesis 1: Both genders have the same acceptance rate.

H0: acceptanceRate(Males) = acceptanceRate(Females);

H1: acceptanceRate(Males) != acceptanceRate(Females).

Data

It doesn't make sense to assess the quality of something that hasn't been done, so we exclude users who didn't post any answers.


In [18]:
# Keep only users with at least one answer and take their accepted_rate.
females_acc_rate = females[females['answers_total'] > 0]['accepted_rate']
males_acc_rate = males[males['answers_total'] > 0]['accepted_rate']

The data's shape


In [28]:
show_data_shape(females_acc_rate, males_acc_rate, "norm", 30, "Accepted Rate")


Levene's test:  0.728592261497
Skewness for Females:  2.56628745179
Skewness for Males:  2.36882125452

Hypothesis test


In [29]:
# Print the p-value (index 1 of each result) of three two-sample tests.
# NOTE(review): the Mann-Whitney p-value is doubled on the assumption that
# stats.mannwhitneyu returns a one-sided p-value; recent SciPy releases return
# a two-sided p-value by default, so confirm against the installed version.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_acc_rate, males_acc_rate)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_acc_rate, males_acc_rate)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_acc_rate, males_acc_rate)[1]


Two-sample Kolmogorov-Smirnov test:  0.99843735701
Two-sample unpaired t-test:  0.728592261498
Two-sample Mann Whitney U test:  0.761297791248

Looking at the top contributors


In [30]:
# Top contributors with at least one answer: take their accepted_rate.
top_females_acc_rate = top_females[top_females['answers_total'] > 0]['accepted_rate']
top_males_acc_rate = top_males[top_males['answers_total'] > 0]['accepted_rate']

The data's shape


In [31]:
show_data_shape(top_females_acc_rate, top_males_acc_rate, "expon", 30, "Accepted Rate")


Levene's test:  0.139907387493
Skewness for Females:  0.724413426319
Skewness for Males:  2.05749012295

Hypothesis test


In [57]:
# Print the p-value (index 1 of each result) of the two-sample tests; the
# Welch t-test line is left disabled.
# NOTE(review): the Mann-Whitney p-value is doubled on the assumption that
# stats.mannwhitneyu returns a one-sided p-value; recent SciPy releases return
# a two-sided p-value by default, so confirm against the installed version.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_acc_rate, top_males_acc_rate)[1]
# print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_acc_rate, top_males_acc_rate, equal_var=False)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_acc_rate, top_males_acc_rate)[1]


Two-sample Kolmogorov-Smirnov test:  0.721167823567
Two-sample Mann Whitney U test:  0.688345914725

Looking at the common contributors


In [33]:
# Common contributors with at least one answer: take their accepted_rate.
common_females_acc_rate = common_females[common_females['answers_total'] > 0]['accepted_rate']
common_males_acc_rate = common_males[common_males['answers_total'] > 0]['accepted_rate']

The data's shape


In [34]:
show_data_shape(common_females_acc_rate, common_males_acc_rate, "expon", 30, "Accepted Rate")


Levene's test:  0.998758098787
Skewness for Females:  2.51308705037
Skewness for Males:  2.45809441702

Hypothesis test


In [61]:
# Print the p-value (index 1 of each result) of three two-sample tests.
# NOTE(review): the Mann-Whitney p-value is doubled on the assumption that
# stats.mannwhitneyu returns a one-sided p-value; recent SciPy releases return
# a two-sided p-value by default, so confirm against the installed version.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_acc_rate, common_males_acc_rate)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_acc_rate, common_males_acc_rate)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_acc_rate, common_males_acc_rate)[1]


Two-sample Kolmogorov-Smirnov test:  1.0
Two-sample unpaired t-test:  0.99875809878
Two-sample Mann Whitney U test:  0.881386829017

Hypothesis 2: The mean utility of the answer for each user is the same between genders.

H0: meanUtilityBy(Males) = meanUtilityBy(Females);

H1: meanUtilityBy(Males) != meanUtilityBy(Females).

Data

It doesn't make sense to assess the quality of something that hasn't been done, so we exclude users who didn't post any answers.


In [35]:
# Take mean_utility and drop users for whom it is missing (NaN).
females_mean_utility = females['mean_utility'].dropna()
males_mean_utility = males['mean_utility'].dropna()

The data's shape


In [36]:
show_data_shape(females_mean_utility, males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  0.393929417368
Skewness for Females:  2.36391634488
Skewness for Males:  2.07820309422

Hypothesis test


In [64]:
# Print the p-value (index 1 of each result) of three two-sample tests.
# NOTE(review): the Mann-Whitney p-value is doubled on the assumption that
# stats.mannwhitneyu returns a one-sided p-value; recent SciPy releases return
# a two-sided p-value by default, so confirm against the installed version.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_mean_utility, males_mean_utility)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_mean_utility, males_mean_utility)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_mean_utility, males_mean_utility)[1]


Two-sample Kolmogorov-Smirnov test:  0.199388168274
Two-sample unpaired t-test:  0.181711166699
Two-sample Mann Whitney U test:  0.121796616593

Looking at the top contributors


In [39]:
# Top contributors: take mean_utility and drop users for whom it is missing (NaN).
top_females_mean_utility = top_females['mean_utility'].dropna()
top_males_mean_utility = top_males['mean_utility'].dropna()

The data's shape


In [40]:
show_data_shape(top_females_mean_utility, top_males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  0.914433866537
Skewness for Females:  3.3721538152
Skewness for Males:  4.1265993826

Hypothesis test


In [67]:
# Print the p-value (index 1 of each result) of the two-sample tests; the
# Welch t-test line is left disabled.
# NOTE(review): the Mann-Whitney p-value is doubled on the assumption that
# stats.mannwhitneyu returns a one-sided p-value; recent SciPy releases return
# a two-sided p-value by default, so confirm against the installed version.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_mean_utility, top_males_mean_utility)[1]
# print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_mean_utility, top_males_mean_utility, equal_var=False)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_mean_utility, top_males_mean_utility)[1]


Two-sample Kolmogorov-Smirnov test:  0.243691134256
Two-sample Mann Whitney U test:  0.151675715549

Looking at the common contributors


In [41]:
# Common contributors: take mean_utility and drop users for whom it is missing (NaN).
common_females_mean_utility = common_females['mean_utility'].dropna()
common_males_mean_utility = common_males['mean_utility'].dropna()

The data's shape


In [42]:
show_data_shape(common_females_mean_utility, common_males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  0.540587665681
Skewness for Females:  2.0204157619
Skewness for Males:  1.49730987513

Hypothesis test


In [43]:
# Print the p-value (index 1 of each result) of three two-sample tests.
# NOTE(review): the Mann-Whitney p-value is doubled on the assumption that
# stats.mannwhitneyu returns a one-sided p-value; recent SciPy releases return
# a two-sided p-value by default, so confirm against the installed version.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_mean_utility, common_males_mean_utility)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_mean_utility, common_males_mean_utility)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_mean_utility, common_males_mean_utility)[1]


Two-sample Kolmogorov-Smirnov test:  0.162034229676
Two-sample unpaired t-test:  0.265011417789
Two-sample Mann Whitney U test:  0.215199989705

Hypothesis 3: The mean of the scores of the questions made by each user doesn't change between genders.

H0: questionScoreMean(Males) = questionScoreMean(Females);

H1: questionScoreMean(Males) != questionScoreMean(Females).

Data

It doesn't make sense to assess the quality of something that hasn't been done, so we exclude users who didn't post any questions.


In [44]:
# Keep only users with at least one question and take their questions_avg.
females_questions_mean = females[females['questions_total'] > 0]['questions_avg']
males_questions_mean = males[males['questions_total'] > 0]['questions_avg']

The shape of the data


In [45]:
show_data_shape(females_questions_mean, males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.59464892137
Skewness for Females:  3.8414267351
Skewness for Males:  26.6955300255

Hypothesis test


In [46]:
# Print the p-value (index 1 of each result) of three two-sample tests.
# NOTE(review): the Mann-Whitney p-value is doubled on the assumption that
# stats.mannwhitneyu returns a one-sided p-value; recent SciPy releases return
# a two-sided p-value by default, so confirm against the installed version.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_questions_mean, males_questions_mean)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_questions_mean, males_questions_mean)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_questions_mean, males_questions_mean)[1]


Two-sample Kolmogorov-Smirnov test:  0.535352764773
Two-sample unpaired t-test:  0.716499112038
Two-sample Mann Whitney U test:  0.891724322274

Looking at the top contributors


In [47]:
# Top contributors with at least one question: take their questions_avg.
top_females_questions_mean = top_females[top_females['questions_total'] > 0]['questions_avg']
top_males_questions_mean = top_males[top_males['questions_total'] > 0]['questions_avg']

The data's shape


In [48]:
show_data_shape(top_females_questions_mean, top_males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.95770294747
Skewness for Females:  2.84914923928
Skewness for Males:  17.0412063252

Hypothesis test


In [49]:
# Print the p-value (index 1 of each result) of the two-sample tests; the
# Welch t-test line is left disabled.
# NOTE(review): the Mann-Whitney p-value is doubled on the assumption that
# stats.mannwhitneyu returns a one-sided p-value; recent SciPy releases return
# a two-sided p-value by default, so confirm against the installed version.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(top_females_questions_mean, top_males_questions_mean)[1]
# print "Two-sample unpaired t-test: ", stats.ttest_ind(top_females_questions_mean, top_males_questions_mean, equal_var=False)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_questions_mean, top_males_questions_mean)[1]


Two-sample Kolmogorov-Smirnov test:  0.303993271641
Two-sample Mann Whitney U test:  0.124785533252

Looking at the common contributors


In [50]:
# Common contributors with at least one question: take their questions_avg.
common_females_questions_mean = common_females[common_females['questions_total'] > 0]['questions_avg']
common_males_questions_mean = common_males[common_males['questions_total'] > 0]['questions_avg']

The data's shape


In [51]:
show_data_shape(common_females_questions_mean, common_males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.305260370133
Skewness for Females:  4.09606543196
Skewness for Males:  10.9504884314

Hypothesis test


In [52]:
# Print the p-value (index 1 of each result) of three two-sample tests.
# NOTE(review): the Mann-Whitney p-value is doubled on the assumption that
# stats.mannwhitneyu returns a one-sided p-value; recent SciPy releases return
# a two-sided p-value by default, so confirm against the installed version.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(common_females_questions_mean, common_males_questions_mean)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(common_females_questions_mean, common_males_questions_mean)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_questions_mean, common_males_questions_mean)[1]


Two-sample Kolmogorov-Smirnov test:  0.674317041232
Two-sample unpaired t-test:  0.49163797857
Two-sample Mann Whitney U test:  0.794475444927

Hypothesis 4: Reputation is the same between genders.

H0: reputation(Males) = reputation(Females);

H1: reputation(Males) != reputation(Females).


In [53]:
# Reputation is taken for every user in each group; no filtering is applied.
females_reputation = females['reputation']
males_reputation = males['reputation']

In [54]:
show_data_shape(females_reputation, males_reputation, "expon", 50, "Reputation")


Levene's test:  0.552625372729
Skewness for Females:  9.51646785135
Skewness for Males:  16.8041524801

Hypothesis test


In [55]:
# Print the p-value (index 1 of each result) of three two-sample tests.
# NOTE(review): the Mann-Whitney p-value is doubled on the assumption that
# stats.mannwhitneyu returns a one-sided p-value; recent SciPy releases return
# a two-sided p-value by default, so confirm against the installed version.
print "Two-sample Kolmogorov-Smirnov test: ", stats.ks_2samp(females_reputation, males_reputation)[1]
print "Two-sample unpaired t-test: ", stats.ttest_ind(females_reputation, males_reputation)[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_reputation, males_reputation)[1]


Two-sample Kolmogorov-Smirnov test:  0.00321579664365
Two-sample unpaired t-test:  0.577952825719
Two-sample Mann Whitney U test:  0.0663889795178

Initialization

Here you can find the data importing and some useful functions used for analysing the data. Please, run this first, otherwise the analysis will not work.

Importing the data from the MongoDB database and loading it into a pandas DataFrame for easy manipulation.


In [2]:
from __future__ import division
import pymongo, time, pylab, numpy, pandas
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot

%matplotlib inline

# Connect to a MongoDB server on this machine, port 27017.
client = pymongo.MongoClient('localhost', 27017)

# The database is selected by community name; the data is read from its
# `statistics` collection.
community = 'programmers'
stats_db = client[community].statistics

# Filter: documents with at least one question, answer or comment.
# Projection: only the fields used by the analysis (plus `gender` for the
# split below); `_id` is excluded.
cursor = stats_db.find({'$or': [{'questions_total':{'$gt':0}}, {'answers_total':{'$gt':0}}, {'comments_total':{'$gt':0}}] },
                       {u'_id': False, u'accepted_rate': True, u'reputation': True,
                        u'questions_avg': True, u'answers_total': True, u'gender':True,
                        u'questions_total': True, u'answers_accepted_total': True,
                        u'mean_utility': True,})

# Read the whole cursor into memory and build one DataFrame from it.
df =  pandas.DataFrame(list(cursor))

# Split by the `gender` field; rows with any other value end up in neither frame.
males = df[df['gender']=='Male']
females = df[df['gender']=='Female']

Utility functions for plotting.


In [27]:
# Reset matplotlib's rc parameters to their defaults, then apply the 'ggplot' style.
pyplot.rcdefaults()
mpl.style.use('ggplot')

def histogram(sample1, sample2, bins, aspect):
    """Draw side-by-side histograms of two samples on a new figure.

    sample1 -- iterable of values drawn on the left panel (titled "Females")
    sample2 -- iterable of values drawn on the right panel (titled "Males")
    bins    -- forwarded to ``Axes.hist`` as its bins argument
    aspect  -- name of the measured quantity, used as the title prefix

    The figure is created but not shown; the caller calls ``pyplot.show()``.
    """
    _, (left_ax, right_ax) = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    panels = ((left_ax, sample1, "Females"), (right_ax, sample2, "Males"))
    for panel, values, group in panels:
        panel.hist(list(values), bins)
        panel.set_title(aspect + " by " + group + " - Histogram")

def pdf_plot(sample1, sample2, aspect):
    """Draw side-by-side density plots of two samples on a new figure.

    sample1 -- object with a pandas-style ``plot(ax=..., kind=...)`` method,
               drawn on the left panel (titled "Females")
    sample2 -- same, drawn on the right panel (titled "Males")
    aspect  -- name of the measured quantity, used as the title prefix

    The figure is created but not shown; the caller calls ``pyplot.show()``.
    """
    # An earlier hand-rolled stats.gaussian_kde version (fixed 0-20 x range,
    # covariance factor .25) was disabled in favour of the density plot below.
    _, panels = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    females_ax = sample1.plot(ax=panels[0], kind="density")
    males_ax = sample2.plot(ax=panels[1], kind="density")

    # Titles go on the axes returned by the plot calls.
    females_ax.set_title(aspect + " by Females - Density")
    males_ax.set_title(aspect + " by Males - Density")

    
    
def boxplot(sample1, sample2, aspect):
    """Draw side-by-side box plots of two samples on a new figure.

    sample1 -- iterable of values drawn on the left panel (titled "Females")
    sample2 -- iterable of values drawn on the right panel (titled "Males")
    aspect  -- name of the measured quantity, used as the title prefix

    The figure is created but not shown; the caller calls ``pyplot.show()``.
    """
    _, (left_ax, right_ax) = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    panels = ((left_ax, sample1, "Females"), (right_ax, sample2, "Males"))
    for panel, values, group in panels:
        panel.boxplot(list(values))
        panel.set_title(aspect + " by " + group + " - Boxplot")
    

def qq_plot(sample1, sample2, distribution, aspect):
    """Draw side-by-side probability (Q-Q) plots of two samples on a new figure.

    sample1      -- iterable of values drawn on the left panel ("Females")
    sample2      -- iterable of values drawn on the right panel ("Males")
    distribution -- distribution name passed to ``stats.probplot`` as ``dist``
                    (e.g. "norm", "expon"); also appended to each title
    aspect       -- name of the measured quantity, used as the title prefix

    The figure is created but not shown; the caller calls ``pyplot.show()``.
    """
    fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))

    panels = ((axes[0], sample1, "Females"), (axes[1], sample2, "Males"))
    for ax, sample, group in panels:
        # Select the axes created above as the current axes instead of calling
        # pyplot.subplot(121/122): depending on the matplotlib version, that
        # call may replace the existing axes, and the title set below would
        # then land on an axes that is no longer displayed.
        pyplot.sca(ax)
        stats.probplot(list(sample), dist=distribution, plot=pyplot)
        # Applied after probplot so the custom title is the one that remains.
        ax.set_title(aspect + " by " + group + " - QQPlot " + distribution)

Utility functions for describing the data.


In [4]:
def describe(sample1, sample2):
    """Print summary statistics and the median of two samples.

    sample1, sample2 -- objects with pandas-style ``describe()`` and
                        ``median()`` methods (e.g. ``pandas.Series``)

    For each sample the ``describe()`` output is printed, followed by a
    "Median:" line; a blank line separates the two samples.
    """
    # Single-argument print(...) calls run on both Python 2 and Python 3, and
    # on Python 2 they produce the same text as the former print statements.
    print(sample1.describe())
    print("Median:  %s" % (sample1.median(),))
    print("")
    print(sample2.describe())
    print("Median:  %s" % (sample2.median(),))
    
def show_data_shape(sample1, sample2, dist, bins, aspect):
    """Plot and print a set of shape diagnostics for two samples.

    sample1 -- first sample; labelled "Females" in plots and output
    sample2 -- second sample; labelled "Males" in plots and output
    dist    -- distribution name used for the Q-Q plots (passed to qq_plot)
    bins    -- bins argument for the histograms (passed to histogram)
    aspect  -- name of the measured quantity, used in the plot titles

    Closes all open figures, draws histogram, density, Q-Q and box plots,
    shows them, then prints the p-value of Levene's test and the skewness of
    each sample.
    """
    pyplot.close('all')

    # One figure per diagnostic: histogram, density, Q-Q plot, box plot.
    histogram(sample1, sample2, bins, aspect)
    pdf_plot(sample1, sample2, aspect)
    qq_plot(sample1, sample2, dist, aspect)
    boxplot(sample1, sample2, aspect)
    pyplot.show()

    # Single-argument print(...) calls run on both Python 2 and Python 3, and
    # on Python 2 they produce the same text as the former print statements.

    # Levene's test; index 1 of the result is the p-value.
    print("Levene's test:  %s" % (stats.levene(sample1, sample2)[1],))

    # Skewness of each sample.
    print("Skewness for Females:  %s" % (stats.skew(sample1),))
    print("Skewness for Males:  %s" % (stats.skew(sample2),))

In [ ]: