Community: SuperUser

Initialization and data import are at the end of this notebook. They were placed at the bottom so the analysis is easier to read, but they must be run first for the analysis to work as expected. Click here to go there.

Data Summary

Women


In [4]:
females.describe()


Out[4]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 788.000000 788.000000 788.000000 303.000000 788.000000 788.000000 788.000000
mean 0.165580 1.648477 5.181472 -0.021532 1.914466 2.753807 238.482234
std 0.309572 11.771632 32.435824 0.884182 9.667593 7.462421 727.567127
min 0.000000 0.000000 0.000000 -1.581139 -4.000000 0.000000 51.000000
25% 0.000000 0.000000 0.000000 -0.811107 0.000000 0.000000 101.000000
50% 0.000000 0.000000 1.000000 -0.214010 0.333333 1.000000 116.000000
75% 0.222222 1.000000 2.000000 0.707107 1.909382 3.000000 156.000000
max 1.000000 273.000000 761.000000 2.626129 236.000000 102.000000 15164.000000

In [5]:
females.median()


Out[5]:
accepted_rate               0.000000
answers_accepted_total      0.000000
answers_total               1.000000
mean_utility               -0.214010
questions_avg               0.333333
questions_total             1.000000
reputation                116.000000
dtype: float64

Men


In [6]:
males.describe()


Out[6]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 18264.000000 18264.000000 18264.000000 7859.000000 18264.000000 18264.000000 18264.000000
mean 0.179639 1.698806 5.264838 0.033028 1.571798 2.522120 267.312308
std 0.317351 16.896761 42.707281 0.869367 11.400197 6.882908 1473.342909
min 0.000000 0.000000 0.000000 -2.000000 -5.000000 0.000000 50.000000
25% 0.000000 0.000000 0.000000 -0.707107 0.000000 0.000000 101.000000
50% 0.000000 0.000000 1.000000 -0.068199 0.333333 1.000000 118.000000
75% 0.272727 1.000000 2.000000 0.712290 1.857143 2.000000 170.000000
max 1.000000 1031.000000 2717.000000 3.844818 1213.000000 249.000000 99647.000000

In [7]:
males.median()


Out[7]:
accepted_rate               0.000000
answers_accepted_total      0.000000
answers_total               1.000000
mean_utility               -0.068199
questions_avg               0.333333
questions_total             1.000000
reputation                118.000000
dtype: float64

Top contributors


In [9]:
pyplot.close('all')
# Full reputation range first, then the same histograms restricted to
# users at or below each reputation cap.
histogram(females["reputation"], males["reputation"], 100, "Reputation")
for reputation_cap in (1000, 400):
    capped_females = females.loc[females["reputation"] <= reputation_cap, "reputation"]
    capped_males = males.loc[males["reputation"] <= reputation_cap, "reputation"]
    histogram(capped_females, capped_males, 100, "Reputation")
pyplot.show()


Top Women


In [10]:
# Top contributors (women): users whose reputation is strictly above 400.
top_females = females[females["reputation"]> 400]
top_females.describe()


Out[10]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 61.000000 61.000000 61.000000 27.000000 61.000000 61.000000 61.000000
mean 0.377280 16.672131 49.967213 0.301585 7.987154 12.229508 1487.803279
std 0.241225 39.498828 107.280313 0.547925 31.902247 20.407020 2274.934327
min 0.000000 0.000000 0.000000 -0.529357 0.000000 0.000000 411.000000
25% 0.222222 2.000000 6.000000 0.000165 0.000000 0.000000 517.000000
50% 0.344828 4.000000 18.000000 0.184230 2.000000 4.000000 726.000000
75% 0.500000 15.000000 45.000000 0.616961 3.500000 16.000000 1285.000000
max 1.000000 273.000000 761.000000 1.413399 236.000000 102.000000 15164.000000

In [11]:
top_females.median()


Out[11]:
accepted_rate               0.344828
answers_accepted_total      4.000000
answers_total              18.000000
mean_utility                0.184230
questions_avg               2.000000
questions_total             4.000000
reputation                726.000000
dtype: float64

Top Men


In [12]:
# Top contributors (men): users whose reputation is strictly above 400.
top_males = males[males["reputation"]> 400]
top_males.describe()


Out[12]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 1525.000000 1525.000000 1525.000000 705.000000 1525.000000 1525.000000 1525.000000
mean 0.340457 15.854426 46.017705 0.311329 4.718238 10.862295 1690.312787
std 0.209724 56.517607 141.280692 0.699377 32.919486 19.735440 4874.398334
min 0.000000 0.000000 0.000000 -1.138071 -5.000000 0.000000 401.000000
25% 0.200000 2.000000 7.000000 -0.120996 0.000000 1.000000 511.000000
50% 0.333333 5.000000 16.000000 0.206316 2.000000 4.000000 713.000000
75% 0.444444 11.000000 38.000000 0.657500 4.000000 13.000000 1346.000000
max 1.000000 1031.000000 2717.000000 3.170435 1213.000000 249.000000 99647.000000

In [13]:
top_males.median()


Out[13]:
accepted_rate               0.333333
answers_accepted_total      5.000000
answers_total              16.000000
mean_utility                0.206316
questions_avg               2.000000
questions_total             4.000000
reputation                713.000000
dtype: float64

Common women contributors


In [14]:
# Common contributors (women): users whose reputation is at most 400.
common_females = females[females["reputation"] <= 400]
common_females.describe()


Out[14]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 727.000000 727.000000 727.000000 276.000000 727.000000 727.000000 727.000000
mean 0.147817 0.387895 1.423659 -0.053141 1.404928 1.958735 133.656121
std 0.308215 0.848442 2.578656 0.904928 3.720874 4.215732 63.030995
min 0.000000 0.000000 0.000000 -1.581139 -4.000000 0.000000 51.000000
25% 0.000000 0.000000 0.000000 -0.853553 0.000000 0.000000 101.000000
50% 0.000000 0.000000 1.000000 -0.325780 0.000000 1.000000 111.000000
75% 0.000000 0.000000 2.000000 0.707107 1.500000 2.000000 141.000000
max 1.000000 6.000000 24.000000 2.626129 63.000000 64.000000 400.000000

In [15]:
common_females.median()


Out[15]:
accepted_rate               0.00000
answers_accepted_total      0.00000
answers_total               1.00000
mean_utility               -0.32578
questions_avg               0.00000
questions_total             1.00000
reputation                111.00000
dtype: float64

Common men contributors


In [16]:
# Common contributors (men): users whose reputation is at most 400.
common_males = males[males["reputation"] <= 400]
common_males.describe()


Out[16]:
accepted_rate answers_accepted_total answers_total mean_utility questions_avg questions_total reputation
count 16739.000000 16739.000000 16739.000000 7154.000000 16739.000000 16739.000000 16739.000000
mean 0.164988 0.409164 1.552064 0.005602 1.285143 1.762292 137.670410
std 0.321421 0.869438 2.757249 0.879639 6.492368 3.051703 62.310956
min 0.000000 0.000000 0.000000 -2.000000 -4.000000 0.000000 50.000000
25% 0.000000 0.000000 0.000000 -0.721378 0.000000 0.000000 101.000000
50% 0.000000 0.000000 1.000000 -0.146447 0.000000 1.000000 115.000000
75% 0.166667 1.000000 2.000000 0.737146 1.500000 2.000000 150.000000
max 1.000000 17.000000 45.000000 3.844818 743.000000 47.000000 400.000000

In [17]:
common_males.median()


Out[17]:
accepted_rate               0.000000
answers_accepted_total      0.000000
answers_total               1.000000
mean_utility               -0.146447
questions_avg               0.000000
questions_total             1.000000
reputation                115.000000
dtype: float64

Second Question: Are the contributions made by both genders perceived as having the same quality by the community?

Hypothesis 1: Both genders have the same acceptance rate.

H0: acceptanceRate(Males) = acceptanceRate(Females);

H1: acceptanceRate(Males) != acceptanceRate(Females).

Data

It doesn't make sense to assess the quality of something that hasn't been done, so we exclude users who didn't post any answers.


In [18]:
# Accepted rate of users who posted at least one answer.
females_acc_rate = females.loc[females['answers_total'] > 0, 'accepted_rate']
males_acc_rate = males.loc[males['answers_total'] > 0, 'accepted_rate']

The data's shape


In [19]:
show_data_shape(females_acc_rate, males_acc_rate, "norm", 30, "Accepted Rate")


Levene's test:  0.859952344621
Skewness for Females:  0.962686576701
Skewness for Males:  0.984564339402

Hypothesis test


In [20]:
# p-values (element [1] of each result) of three two-sample tests on the
# female and male accepted rates. Single-argument print(...) with explicit
# formatting emits the same text under Python 2 and Python 3.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(females_acc_rate, males_acc_rate)[1],))
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(females_acc_rate, males_acc_rate)[1],))
# NOTE(review): the factor 2 assumes mannwhitneyu returns a one-sided
# p-value; SciPy releases whose default is two-sided would make this
# double-count -- confirm against the installed SciPy version.
print("Two-sample Mann Whitney U test:  %s" % (2 * stats.mannwhitneyu(females_acc_rate, males_acc_rate)[1],))


Two-sample Kolmogorov-Smirnov test:  0.999985911864
Two-sample unpaired t-test:  0.837588594046
Two-sample Mann Whitney U test:  0.902526177263

Looking at the top contributors


In [21]:
# Accepted rate of top contributors who posted at least one answer.
top_females_acc_rate = top_females.loc[top_females['answers_total'] > 0, 'accepted_rate']
top_males_acc_rate = top_males.loc[top_males['answers_total'] > 0, 'accepted_rate']

The data's shape


In [22]:
show_data_shape(top_females_acc_rate, top_males_acc_rate, "expon", 30, "Accepted Rate")


Levene's test:  0.421231233001
Skewness for Females:  1.04377695226
Skewness for Males:  0.88917740164

Hypothesis test


In [23]:
# p-values (element [1] of each result) of the two-sample tests on the top
# contributors' accepted rates. Single-argument print(...) with explicit
# formatting emits the same text under Python 2 and Python 3.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(top_females_acc_rate, top_males_acc_rate)[1],))
# Welch t-test kept disabled, as in the original cell:
# print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(top_females_acc_rate, top_males_acc_rate, equal_var=False)[1],))
# NOTE(review): the factor 2 assumes mannwhitneyu returns a one-sided
# p-value; SciPy releases whose default is two-sided would make this
# double-count -- confirm against the installed SciPy version.
print("Two-sample Mann Whitney U test:  %s" % (2 * stats.mannwhitneyu(top_females_acc_rate, top_males_acc_rate)[1],))


Two-sample Kolmogorov-Smirnov test:  0.517712004393
Two-sample Mann Whitney U test:  0.199668144595

Looking at the common contributors


In [24]:
# Accepted rate of common contributors who posted at least one answer.
common_females_acc_rate = common_females.loc[common_females['answers_total'] > 0, 'accepted_rate']
common_males_acc_rate = common_males.loc[common_males['answers_total'] > 0, 'accepted_rate']

The data's shape


In [25]:
show_data_shape(common_females_acc_rate, common_males_acc_rate, "expon", 30, "Accepted Rate")


Levene's test:  0.888485773203
Skewness for Females:  1.02395148841
Skewness for Males:  1.00897523968

Hypothesis test


In [26]:
# p-values (element [1] of each result) of three two-sample tests on the
# common contributors' accepted rates. Single-argument print(...) with
# explicit formatting emits the same text under Python 2 and Python 3.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(common_females_acc_rate, common_males_acc_rate)[1],))
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(common_females_acc_rate, common_males_acc_rate)[1],))
# NOTE(review): the factor 2 assumes mannwhitneyu returns a one-sided
# p-value; SciPy releases whose default is two-sided would make this
# double-count -- confirm against the installed SciPy version.
print("Two-sample Mann Whitney U test:  %s" % (2 * stats.mannwhitneyu(common_females_acc_rate, common_males_acc_rate)[1],))


Two-sample Kolmogorov-Smirnov test:  0.999976750102
Two-sample unpaired t-test:  0.888485773201
Two-sample Mann Whitney U test:  0.882075787725

Hypothesis 2: The mean utility of the answer for each user is the same between genders.

H0: meanUtilityBy(Males) = meanUtilityBy(Females);

H1: meanUtilityBy(Males) != meanUtilityBy(Females).

Data

It doesn't make sense to assess the quality of something that hasn't been done, so we exclude users who didn't post any answers.


In [27]:
females_mean_utility = females['mean_utility'].dropna()
males_mean_utility = males['mean_utility'].dropna()

The data's shape


In [28]:
show_data_shape(females_mean_utility, males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  0.610969055971
Skewness for Females:  0.49164056925
Skewness for Males:  0.464702622392

Hypothesis test


In [30]:
# p-values (element [1] of each result) of three two-sample tests on the
# female and male mean utility. Single-argument print(...) with explicit
# formatting emits the same text under Python 2 and Python 3.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(females_mean_utility, males_mean_utility)[1],))
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(females_mean_utility, males_mean_utility)[1],))
# NOTE(review): the factor 2 assumes mannwhitneyu returns a one-sided
# p-value; SciPy releases whose default is two-sided would make this
# double-count -- confirm against the installed SciPy version.
print("Two-sample Mann Whitney U test:  %s" % (2 * stats.mannwhitneyu(females_mean_utility, males_mean_utility)[1],))


Two-sample Kolmogorov-Smirnov test:  0.263044672617
Two-sample unpaired t-test:  0.284080186296
Two-sample Mann Whitney U test:  0.222665770297

Looking at the top contributors


In [31]:
top_females_mean_utility = top_females['mean_utility'].dropna()
top_males_mean_utility = top_males['mean_utility'].dropna()

The data's shape


In [32]:
show_data_shape(top_females_mean_utility, top_males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  0.250648647929
Skewness for Females:  0.603584133533
Skewness for Males:  0.946775902152

Hypothesis test


In [33]:
# p-values (element [1] of each result) of the two-sample tests on the top
# contributors' mean utility. Single-argument print(...) with explicit
# formatting emits the same text under Python 2 and Python 3.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(top_females_mean_utility, top_males_mean_utility)[1],))
# Welch t-test kept disabled, as in the original cell:
# print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(top_females_mean_utility, top_males_mean_utility, equal_var=False)[1],))
# NOTE(review): the factor 2 assumes mannwhitneyu returns a one-sided
# p-value; SciPy releases whose default is two-sided would make this
# double-count -- confirm against the installed SciPy version.
print("Two-sample Mann Whitney U test:  %s" % (2 * stats.mannwhitneyu(top_females_mean_utility, top_males_mean_utility)[1],))


Two-sample Kolmogorov-Smirnov test:  0.858087125685
Two-sample Mann Whitney U test:  0.930901063935

Looking at the common contributors


In [34]:
common_females_mean_utility = common_females['mean_utility'].dropna()
common_males_mean_utility = common_males['mean_utility'].dropna()

The data's shape


In [35]:
show_data_shape(common_females_mean_utility, common_males_mean_utility, "expon", 30, "Mean Utility Answers")


Levene's test:  0.669413702501
Skewness for Females:  0.554597546791
Skewness for Males:  0.477422213681

Hypothesis test


In [36]:
# p-values (element [1] of each result) of three two-sample tests on the
# common contributors' mean utility. Single-argument print(...) with
# explicit formatting emits the same text under Python 2 and Python 3.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(common_females_mean_utility, common_males_mean_utility)[1],))
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(common_females_mean_utility, common_males_mean_utility)[1],))
# NOTE(review): the factor 2 assumes mannwhitneyu returns a one-sided
# p-value; SciPy releases whose default is two-sided would make this
# double-count -- confirm against the installed SciPy version.
print("Two-sample Mann Whitney U test:  %s" % (2 * stats.mannwhitneyu(common_females_mean_utility, common_males_mean_utility)[1],))


Two-sample Kolmogorov-Smirnov test:  0.143978331
Two-sample unpaired t-test:  0.276862026631
Two-sample Mann Whitney U test:  0.171205762895

Hypothesis 3: The mean of the scores of the questions made by each user doesn't change between genders.

H0: questionScoreMean(Males) = questionScoreMean(Females);

H1: questionScoreMean(Males) != questionScoreMean(Females).

Data

It doesn't make sense to assess the quality of something that hasn't been done, so we exclude users who didn't post any questions.


In [37]:
# Average question score of users who posted at least one question.
females_questions_mean = females.loc[females['questions_total'] > 0, 'questions_avg']
males_questions_mean = males.loc[males['questions_total'] > 0, 'questions_avg']

The shape of the data


In [38]:
show_data_shape(females_questions_mean, males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.384390776645
Skewness for Females:  15.9799781413
Skewness for Males:  66.0843131017

Hypothesis test


In [39]:
# p-values (element [1] of each result) of three two-sample tests on the
# female and male average question scores. Single-argument print(...) with
# explicit formatting emits the same text under Python 2 and Python 3.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(females_questions_mean, males_questions_mean)[1],))
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(females_questions_mean, males_questions_mean)[1],))
# NOTE(review): the factor 2 assumes mannwhitneyu returns a one-sided
# p-value; SciPy releases whose default is two-sided would make this
# double-count -- confirm against the installed SciPy version.
print("Two-sample Mann Whitney U test:  %s" % (2 * stats.mannwhitneyu(females_questions_mean, males_questions_mean)[1],))


Two-sample Kolmogorov-Smirnov test:  0.235373755179
Two-sample unpaired t-test:  0.483082872243
Two-sample Mann Whitney U test:  0.307601673266

Looking at the top contributors


In [40]:
# Average question score of top contributors who posted at least one question.
top_females_questions_mean = top_females.loc[top_females['questions_total'] > 0, 'questions_avg']
top_males_questions_mean = top_males.loc[top_males['questions_total'] > 0, 'questions_avg']

The data's shape


In [41]:
show_data_shape(top_females_questions_mean, top_males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.436436983966
Skewness for Females:  5.47090033262
Skewness for Males:  28.9467519731

Hypothesis test


In [42]:
# p-values (element [1] of each result) of the two-sample tests on the top
# contributors' average question scores. Single-argument print(...) with
# explicit formatting emits the same text under Python 2 and Python 3.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(top_females_questions_mean, top_males_questions_mean)[1],))
# Welch t-test kept disabled, as in the original cell:
# print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(top_females_questions_mean, top_males_questions_mean, equal_var=False)[1],))
# NOTE(review): the factor 2 assumes mannwhitneyu returns a one-sided
# p-value; SciPy releases whose default is two-sided would make this
# double-count -- confirm against the installed SciPy version.
print("Two-sample Mann Whitney U test:  %s" % (2 * stats.mannwhitneyu(top_females_questions_mean, top_males_questions_mean)[1],))


Two-sample Kolmogorov-Smirnov test:  0.553323276216
Two-sample Mann Whitney U test:  0.629538542561

Looking at the common contributors


In [43]:
# Average question score of common contributors who posted at least one question.
common_females_questions_mean = common_females.loc[common_females['questions_total'] > 0, 'questions_avg']
common_males_questions_mean = common_males.loc[common_males['questions_total'] > 0, 'questions_avg']

The data's shape


In [44]:
show_data_shape(common_females_questions_mean, common_males_questions_mean, "expon", 30, "Mean Score Questions")


Levene's test:  0.562164833112
Skewness for Females:  7.41428196711
Skewness for Males:  73.6346541163

Hypothesis test


In [45]:
# p-values (element [1] of each result) of three two-sample tests on the
# common contributors' average question scores. Single-argument print(...)
# with explicit formatting emits the same text under Python 2 and Python 3.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(common_females_questions_mean, common_males_questions_mean)[1],))
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(common_females_questions_mean, common_males_questions_mean)[1],))
# NOTE(review): the factor 2 assumes mannwhitneyu returns a one-sided
# p-value; SciPy releases whose default is two-sided would make this
# double-count -- confirm against the installed SciPy version.
print("Two-sample Mann Whitney U test:  %s" % (2 * stats.mannwhitneyu(common_females_questions_mean, common_males_questions_mean)[1],))


Two-sample Kolmogorov-Smirnov test:  0.193728907933
Two-sample unpaired t-test:  0.783415213503
Two-sample Mann Whitney U test:  0.268812655017

Hypothesis 4: Reputation is the same between genders.

H0: reputation(Males) = reputation(Females);

H1: reputation(Males) != reputation(Females).


In [46]:
females_reputation = females['reputation']
males_reputation = males['reputation']

In [47]:
show_data_shape(females_reputation, males_reputation, "expon", 50, "Reputation")


Levene's test:  0.618743396293
Skewness for Females:  14.0647577589
Skewness for Males:  40.8431733142

Hypothesis test


In [48]:
# p-values (element [1] of each result) of three two-sample tests on the
# female and male reputation. Single-argument print(...) with explicit
# formatting emits the same text under Python 2 and Python 3.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(females_reputation, males_reputation)[1],))
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(females_reputation, males_reputation)[1],))
# NOTE(review): the factor 2 assumes mannwhitneyu returns a one-sided
# p-value; SciPy releases whose default is two-sided would make this
# double-count -- confirm against the installed SciPy version.
print("Two-sample Mann Whitney U test:  %s" % (2 * stats.mannwhitneyu(females_reputation, males_reputation)[1],))


Two-sample Kolmogorov-Smirnov test:  0.0507162704314
Two-sample unpaired t-test:  0.584785773801
Two-sample Mann Whitney U test:  0.0171976083892

Initialization

Here you can find the data importing and some useful functions used for analysing the data. Please, run this first, otherwise the analysis will not work.

Importing the data from the MongoDB database and loading it into a pandas DataFrame for easy manipulation.


In [1]:
from __future__ import division
import pymongo, time, pylab, numpy, pandas
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot

%matplotlib inline

# Connect to the MongoDB server on localhost and select the community's
# `statistics` collection.
client = pymongo.MongoClient('localhost', 27017)

community = 'superuser'
stats_db = client[community].statistics

# Keep only users with at least one question, answer or comment.
active_user_filter = {'$or': [{'questions_total': {'$gt': 0}},
                              {'answers_total': {'$gt': 0}},
                              {'comments_total': {'$gt': 0}}]}

# Return only the fields used by the analysis; leave out the `_id` field.
projected_fields = {u'_id': False}
for field_name in (u'accepted_rate', u'reputation', u'questions_avg',
                   u'answers_total', u'gender', u'questions_total',
                   u'answers_accepted_total', u'mean_utility'):
    projected_fields[field_name] = True

cursor = stats_db.find(active_user_filter, projected_fields)

# Materialise the cursor into a DataFrame, then split it by gender.
df = pandas.DataFrame(list(cursor))

males = df[df['gender'] == 'Male']
females = df[df['gender'] == 'Female']

Utility functions for plotting.


In [2]:
# Reset matplotlib's rc settings to their defaults first, then apply the
# 'ggplot' style on top of them (the order matters).
pyplot.rcdefaults()
mpl.style.use('ggplot')

def histogram(sample1, sample2, bins, aspect):
    """Draw side-by-side histograms of two samples on a new figure.

    sample1 -- values for the left panel (titled "... by Females").
    sample2 -- values for the right panel (titled "... by Males").
    bins    -- forwarded to ``Axes.hist`` as its bins argument.
    aspect  -- string prefixed to each panel title.
    """
    _, (left_ax, right_ax) = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    panels = (
        (left_ax, sample1, " by Females - Histogram"),
        (right_ax, sample2, " by Males - Histogram"),
    )
    for panel_ax, values, title_suffix in panels:
        panel_ax.hist(list(values), bins)
        panel_ax.set_title(aspect + title_suffix)

def pdf_plot(sample1, sample2, aspect):
    """Draw side-by-side density plots of two samples on a new figure.

    sample1 -- left panel data (titled "... by Females"); must provide a
               ``.plot(ax=..., kind="density")`` method.
    sample2 -- right panel data (titled "... by Males"); same requirement.
    aspect  -- string prefixed to each panel title.
    """
    _, (left_ax, right_ax) = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    panels = (
        (left_ax, sample1, " by Females - Density"),
        (right_ax, sample2, " by Males - Density"),
    )
    for panel_ax, values, title_suffix in panels:
        # The title goes on whatever axes object .plot() hands back.
        drawn_ax = values.plot(ax=panel_ax, kind="density")
        drawn_ax.set_title(aspect + title_suffix)

    
    
def boxplot(sample1, sample2, aspect):
    """Draw side-by-side box plots of two samples on a new figure.

    sample1 -- values for the left panel (titled "... by Females").
    sample2 -- values for the right panel (titled "... by Males").
    aspect  -- string prefixed to each panel title.
    """
    _, (left_ax, right_ax) = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    panels = (
        (left_ax, sample1, " by Females - Boxplot"),
        (right_ax, sample2, " by Males - Boxplot"),
    )
    for panel_ax, values, title_suffix in panels:
        panel_ax.boxplot(list(values))
        panel_ax.set_title(aspect + title_suffix)
    

def qq_plot(sample1, sample2, distribution, aspect):
    """Draw side-by-side probability (Q-Q) plots of two samples on a new figure.

    sample1      -- values for the left panel (titled "... by Females").
    sample2      -- values for the right panel (titled "... by Males").
    distribution -- distribution name passed to ``stats.probplot`` as ``dist``;
                    it is also concatenated into the titles, so it must be a
                    string.
    aspect       -- string prefixed to each panel title.
    """
    fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))

    panels = (
        (axes[0], sample1, "Females"),
        (axes[1], sample2, "Males"),
    )
    for panel_ax, values, group in panels:
        # probplot draws through the pyplot state machine (plot=pyplot), so the
        # axes created above is made current explicitly. This guarantees the
        # drawing and the title below land on the same axes, which calling
        # pyplot.subplot(121/122) again does not necessarily do.
        pyplot.sca(panel_ax)
        stats.probplot(list(values), dist=distribution, plot=pyplot)
        # Set after probplot so this title replaces any title probplot wrote.
        panel_ax.set_title(aspect + " by " + group + " - QQPlot " + distribution)

Utility functions for describing the data.


In [3]:
def describe(sample1, sample2):
    """Print summary statistics and the median of two samples.

    For each sample, prints the result of ``.describe()`` followed by a
    "Median:" line; a blank line separates the two samples.

    sample1, sample2 -- objects providing ``.describe()`` and ``.median()``.

    Only single-argument ``print(...)`` calls with explicit formatting are
    used, so the output is the same text under Python 2 and Python 3.
    """
    print(sample1.describe())
    print("Median:  %s" % (sample1.median(),))
    print("")
    print(sample2.describe())
    print("Median:  %s" % (sample2.median(),))
    
def show_data_shape(sample1, sample2, dist, bins, aspect):
    """Plot and summarise the shape of two samples (females first, males second).

    Closes all open figures, draws the histogram, density, Q-Q and box plots
    of both samples, shows them, and then prints element [1] of the Levene
    test result (the p-value in SciPy's return convention) and the skewness
    of each sample.

    sample1, sample2 -- the two samples; pdf_plot calls ``.plot`` on them.
    dist             -- distribution name forwarded to qq_plot.
    bins             -- bins argument forwarded to histogram.
    aspect           -- label used in every plot title.
    """
    pyplot.close('all')

    histogram(sample1, sample2, bins, aspect)   # histograms
    pdf_plot(sample1, sample2, aspect)          # density plots
    qq_plot(sample1, sample2, dist, aspect)     # Q-Q plots
    boxplot(sample1, sample2, aspect)           # box plots
    pyplot.show()

    # Single-argument print(...) with explicit formatting emits the same text
    # under Python 2 and Python 3.
    print("Levene's test:  %s" % (stats.levene(sample1, sample2)[1],))
    print("Skewness for Females:  %s" % (stats.skew(sample1),))
    print("Skewness for Males:  %s" % (stats.skew(sample2),))

In [ ]: