Community: StackOverflow

Initialization and data import are at the end of this notebook. They were placed at the bottom to keep the analysis easier to read, but they must be run first for the analysis to work as expected. Click here to go there.

Data Summary

Women



In [4]:

    
females.describe()









    Out[4]:






  
    
      
      accepted_rate
      answers_accepted_total
      answers_total
      mean_utility
      questions_avg
      questions_total
      reputation
    
  
  
    
      count
       7897.000000
       7897.000000
       7897.000000
       4161.000000
       7897.000000
       7897.000000
         7897.000000
    
    
      mean
          0.270054
          7.582246
         22.341775
         -0.021649
          2.056335
         13.543624
          694.373939
    
    
      std
          0.300061
         43.464103
        102.089690
          0.781284
          5.913016
         31.424080
         3526.726330
    
    
      min
          0.000000
          0.000000
          0.000000
         -1.827701
         -4.000000
          0.000000
           50.000000
    
    
      25%
          0.000000
          0.000000
          1.000000
         -0.607915
          0.058824
          1.000000
           75.000000
    
    
      50%
          0.200000
          1.000000
          4.000000
         -0.160692
          0.909091
          5.000000
          132.000000
    
    
      75%
          0.428571
          4.000000
         13.000000
          0.457107
          2.000000
         14.000000
          394.000000
    
    
      max
          1.000000
       1445.000000
       3249.000000
          3.697290
        196.000000
        728.000000
       141184.000000



In [5]:

    
females.median()









    Out[5]:





accepted_rate               0.200000
answers_accepted_total      1.000000
answers_total               4.000000
mean_utility               -0.160692
questions_avg               0.909091
questions_total             5.000000
reputation                132.000000
dtype: float64

Men



In [6]:

    
males.describe()









    Out[6]:






  
    
      
      accepted_rate
      answers_accepted_total
      answers_total
      mean_utility
      questions_avg
      questions_total
      reputation
    
  
  
    
      count
       125340.000000
       125340.000000
       125340.000000
       67917.000000
       125340.000000
       125340.000000
       125340.000000
    
    
      mean
            0.281233
           12.361401
           34.937674
          -0.002369
            2.263167
           11.434690
         1169.622946
    
    
      std
            0.271936
           94.287404
          195.043927
           0.754368
            8.969468
           27.684822
         6567.785536
    
    
      min
            0.000000
            0.000000
            0.000000
          -2.236068
           -6.000000
            0.000000
           50.000000
    
    
      25%
            0.000000
            0.000000
            2.000000
          -0.548726
            0.000000
            1.000000
           86.000000
    
    
      50%
            0.250000
            2.000000
            6.000000
          -0.130009
            1.000000
            4.000000
          176.000000
    
    
      75%
            0.421053
            6.000000
           20.000000
           0.404377
            2.277778
           12.000000
          606.000000
    
    
      max
            1.000000
        16730.000000
        28137.000000
           5.313012
         1300.000000
         1327.000000
       640237.000000



In [7]:

    
males.median()









    Out[7]:





accepted_rate               0.250000
answers_accepted_total      2.000000
answers_total               6.000000
mean_utility               -0.130009
questions_avg               1.000000
questions_total             4.000000
reputation                176.000000
dtype: float64

Top contributors



In [10]:

    
pyplot.close('all')

# Whole reputation range, women (left panel) vs. men (right panel).
histogram(females["reputation"], males["reputation"], 100, "Reputation")

# Same comparison restricted to users with reputation <= 1000. The title
# states the restriction so the two figures can be told apart.
histogram(females.loc[females["reputation"] <= 1000, "reputation"],
          males.loc[males["reputation"] <= 1000, "reputation"],
          100, "Reputation (<= 1000)")
pyplot.show()

Top Women



In [11]:

    
# Top contributors (women): reputation strictly greater than 1000.
top_females = females.loc[females["reputation"].gt(1000)]
top_females.describe()









    Out[11]:






  
    
      
      accepted_rate
      answers_accepted_total
      answers_total
      mean_utility
      questions_avg
      questions_total
      reputation
    
  
  
    
      count
       868.000000
        868.000000
        868.000000
       181.000000
       868.000000
       868.000000
          868.000000
    
    
      mean
         0.342600
         52.654378
        144.726959
         0.070394
         3.903852
        38.675115
         4651.165899
    
    
      std
         0.157587
        121.787467
        277.750081
         0.619072
        10.459406
        77.020379
         9763.712921
    
    
      min
         0.000000
          0.000000
          0.000000
        -1.000000
        -2.500000
         0.000000
         1002.000000
    
    
      25%
         0.244309
         10.000000
         32.000000
        -0.373423
         0.944153
         3.000000
         1330.750000
    
    
      50%
         0.333333
         22.000000
         70.000000
         0.031842
         1.988506
        12.500000
         2027.500000
    
    
      75%
         0.437363
         46.000000
        133.250000
         0.383406
         3.671875
        38.000000
         3837.250000
    
    
      max
         1.000000
       1445.000000
       3249.000000
         3.107033
       173.437500
       728.000000
       141184.000000



In [12]:

    
top_females.median()









    Out[12]:





accepted_rate                0.333333
answers_accepted_total      22.000000
answers_total               70.000000
mean_utility                 0.031842
questions_avg                1.988506
questions_total             12.500000
reputation                2027.500000
dtype: float64

Top Men



In [13]:

    
# Top contributors (men): reputation strictly greater than 1000.
top_males = males.loc[males["reputation"].gt(1000)]
top_males.describe()









    Out[13]:






  
    
      
      accepted_rate
      answers_accepted_total
      answers_total
      mean_utility
      questions_avg
      questions_total
      reputation
    
  
  
    
      count
       21572.000000
       21572.000000
       21572.000000
       3925.000000
       21572.000000
       21572.000000
        21572.000000
    
    
      mean
           0.346288
          60.458743
         161.496755
          0.157097
           4.593401
          29.144029
         5655.511218
    
    
      std
           0.148891
         220.917545
         448.405582
          0.613714
          19.086542
          56.172743
        15036.391721
    
    
      min
           0.000000
           0.000000
           0.000000
         -1.582612
          -5.000000
           0.000000
         1001.000000
    
    
      25%
           0.250000
          11.000000
          38.000000
         -0.198330
           1.000000
           3.000000
         1425.000000
    
    
      50%
           0.333333
          24.000000
          73.000000
          0.067474
           2.246664
          10.000000
         2272.500000
    
    
      75%
           0.432692
          50.000000
         146.000000
          0.362103
           4.375000
          31.000000
         4612.250000
    
    
      max
           1.000000
       16730.000000
       28137.000000
          5.313012
        1300.000000
        1327.000000
       640237.000000



In [14]:

    
top_males.median()









    Out[14]:





accepted_rate                0.333333
answers_accepted_total      24.000000
answers_total               73.000000
mean_utility                 0.067474
questions_avg                2.246664
questions_total             10.000000
reputation                2272.500000
dtype: float64

Common women contributors



In [15]:

    
# Common contributors (women): reputation of at most 1000.
common_females = females.loc[females["reputation"].le(1000)]
common_females.describe()









    Out[15]:






  
    
      
      accepted_rate
      answers_accepted_total
      answers_total
      mean_utility
      questions_avg
      questions_total
      reputation
    
  
  
    
      count
       7029.000000
       7029.000000
       7029.000000
       3980.000000
       7029.000000
       7029.000000
       7029.000000
    
    
      mean
          0.261095
          2.016361
          7.228624
         -0.025835
          1.828188
         10.440176
        205.756011
    
    
      std
          0.312030
          3.287563
         10.690553
          0.787675
          5.031152
         17.028885
        203.109675
    
    
      min
          0.000000
          0.000000
          0.000000
         -1.827701
         -4.000000
          0.000000
         50.000000
    
    
      25%
          0.000000
          0.000000
          1.000000
         -0.625221
          0.000000
          1.000000
         71.000000
    
    
      50%
          0.166667
          1.000000
          3.000000
         -0.170780
          0.800000
          5.000000
        116.000000
    
    
      75%
          0.428571
          3.000000
          9.000000
          0.463401
          1.800000
         13.000000
        271.000000
    
    
      max
          1.000000
         55.000000
        109.000000
          3.697290
        196.000000
        205.000000
        999.000000



In [16]:

    
common_females.median()









    Out[16]:





accepted_rate               0.166667
answers_accepted_total      1.000000
answers_total               3.000000
mean_utility               -0.170780
questions_avg               0.800000
questions_total             5.000000
reputation                116.000000
dtype: float64

Common men contributors



In [17]:

    
# Common contributors (men): reputation of at most 1000.
common_males = males.loc[males["reputation"].le(1000)]
common_males.describe()









    Out[17]:






  
    
      
      accepted_rate
      answers_accepted_total
      answers_total
      mean_utility
      questions_avg
      questions_total
      reputation
    
  
  
    
      count
       103768.000000
       103768.000000
       103768.000000
       63992.000000
       103768.000000
       103768.000000
       103768.000000
    
    
      mean
            0.267709
            2.362597
            8.627708
          -0.012149
            1.778742
            7.753151
          237.065878
    
    
      std
            0.289226
            3.452494
           11.440576
           0.761068
            4.481431
           13.823759
          223.953725
    
    
      min
            0.000000
            0.000000
            0.000000
          -2.236068
           -6.000000
            0.000000
           50.000000
    
    
      25%
            0.000000
            0.000000
            1.000000
          -0.573193
            0.000000
            0.000000
           77.000000
    
    
      50%
            0.200000
            1.000000
            5.000000
          -0.149071
            0.789474
            3.000000
          133.000000
    
    
      75%
            0.411765
            3.000000
           11.000000
           0.408880
            2.000000
            9.000000
          340.000000
    
    
      max
            1.000000
           48.000000
          157.000000
           5.053353
          325.000000
          519.000000
         1000.000000



In [18]:

    
common_males.median()









    Out[18]:





accepted_rate               0.200000
answers_accepted_total      1.000000
answers_total               5.000000
mean_utility               -0.149071
questions_avg               0.789474
questions_total             3.000000
reputation                133.000000
dtype: float64

Second Question: Are the contributions made by both genders perceived by the community as having the same quality?

Hypothesis 1: Both genders have the same acceptance rate.

H0: acceptanceRate(Males) = acceptanceRate(Females);

H1: acceptanceRate(Males) != acceptanceRate(Females).

Data

It doesn't make sense to assess the quality of something that hasn't been done, so we exclude users who didn't post any answers.



In [19]:

    
# Accepted rate of users who posted at least one answer, selected with a
# single .loc lookup (row mask, column label).
females_acc_rate = females.loc[females['answers_total'] > 0, 'accepted_rate']
males_acc_rate = males.loc[males['answers_total'] > 0, 'accepted_rate']

The data's shape



In [21]:

    
show_data_shape(females_acc_rate, males_acc_rate, "norm", 50, "Accepted Rate")









    












    












    












    












    



Levene's test:  5.08482305788e-36
Skewness for Females:  0.923995860446
Skewness for Males:  0.97890009617

Hypothesis test



In [22]:

    
# p-values of three two-sample tests on accepted rate (women vs. men).
# Each line prints one pre-formatted string, so it runs on Python 2 and 3
# and keeps the Python 2 output unchanged.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(females_acc_rate, males_acc_rate)[1],))
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(females_acc_rate, males_acc_rate, equal_var=False)[1],))
# Request the two-sided p-value explicitly rather than doubling the default:
# doubling is only right while the default is one-sided (SciPy >= 0.17 needed).
print("Two-sample Mann Whitney U test:  %s" % (stats.mannwhitneyu(females_acc_rate, males_acc_rate, alternative='two-sided')[1],))









    



Two-sample Kolmogorov-Smirnov test:  9.97826700665e-09
Two-sample unpaired t-test:  0.00258356422005
Two-sample Mann Whitney U test:  0.482122703116

Looking at the top contributors



In [23]:

    
# Accepted rate of top contributors who posted at least one answer.
top_females_acc_rate = top_females.loc[top_females['answers_total'] > 0, 'accepted_rate']
top_males_acc_rate = top_males.loc[top_males['answers_total'] > 0, 'accepted_rate']

The data's shape



In [24]:

    
show_data_shape(top_females_acc_rate, top_males_acc_rate, "expon", 30, "Accepted Rate")









    












    












    












    












    



Levene's test:  0.296995754406
Skewness for Females:  0.780519551776
Skewness for Males:  0.651412780679

Hypothesis test



In [25]:

    
# p-values of three two-sample tests on accepted rate (top women vs. top men).
# Each line prints one pre-formatted string, so it runs on Python 2 and 3
# and keeps the Python 2 output unchanged.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(top_females_acc_rate, top_males_acc_rate)[1],))
# Default equal_var=True: standard (pooled-variance) t-test.
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(top_females_acc_rate, top_males_acc_rate)[1],))
# Request the two-sided p-value explicitly rather than doubling the default:
# doubling is only right while the default is one-sided (SciPy >= 0.17 needed).
print("Two-sample Mann Whitney U test:  %s" % (stats.mannwhitneyu(top_females_acc_rate, top_males_acc_rate, alternative='two-sided')[1],))









    



Two-sample Kolmogorov-Smirnov test:  0.937652401574
Two-sample unpaired t-test:  0.932905756436
Two-sample Mann Whitney U test:  0.811851432448

Looking at the common contributors



In [26]:

    
# Accepted rate of common contributors who posted at least one answer.
common_females_acc_rate = common_females.loc[common_females['answers_total'] > 0, 'accepted_rate']
common_males_acc_rate = common_males.loc[common_males['answers_total'] > 0, 'accepted_rate']

The data's shape



In [27]:

    
show_data_shape(common_females_acc_rate, common_males_acc_rate, "expon", 30, "Accepted Rate")









    












    












    












    












    



Levene's test:  6.83303385742e-23
Skewness for Females:  0.928582303983
Skewness for Males:  1.02388746299

Hypothesis test



In [28]:

    
# p-values of three two-sample tests on accepted rate (common women vs. common men).
# Each line prints one pre-formatted string, so it runs on Python 2 and 3
# and keeps the Python 2 output unchanged.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(common_females_acc_rate, common_males_acc_rate)[1],))
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(common_females_acc_rate, common_males_acc_rate, equal_var=False)[1],))
# Request the two-sided p-value explicitly rather than doubling the default:
# doubling is only right while the default is one-sided (SciPy >= 0.17 needed).
print("Two-sample Mann Whitney U test:  %s" % (stats.mannwhitneyu(common_females_acc_rate, common_males_acc_rate, alternative='two-sided')[1],))









    



Two-sample Kolmogorov-Smirnov test:  2.06314353468e-07
Two-sample unpaired t-test:  9.96882818633e-05
Two-sample Mann Whitney U test:  0.326055127477

Hypothesis 2: The mean utility of the answer for each user is the same between genders.

H0: meanUtilityBy(Males) = meanUtilityBy(Females);

H1: meanUtilityBy(Males) != meanUtilityBy(Females).

Data

It doesn't make sense to assess the quality of something that hasn't been done, so we exclude users who didn't post any answers.



In [29]:

    
# Mean answer utility per user, keeping only users with a non-missing value.
females_mean_utility = females.loc[:, 'mean_utility'].dropna()
males_mean_utility = males.loc[:, 'mean_utility'].dropna()

The data's shape



In [30]:

    
show_data_shape(females_mean_utility, males_mean_utility, "expon", 30, "Mean Utility Answers")









    












    












    












    












    



Levene's test:  9.09599844459e-05
Skewness for Females:  0.762636587448
Skewness for Males:  0.835495912062

Hypothesis test



In [31]:

    
# p-values of three two-sample tests on mean answer utility (women vs. men).
# Each line prints one pre-formatted string, so it runs on Python 2 and 3
# and keeps the Python 2 output unchanged.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(females_mean_utility, males_mean_utility)[1],))
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(females_mean_utility, males_mean_utility, equal_var=False)[1],))
# Request the two-sided p-value explicitly rather than doubling the default:
# doubling is only right while the default is one-sided (SciPy >= 0.17 needed).
print("Two-sample Mann Whitney U test:  %s" % (stats.mannwhitneyu(females_mean_utility, males_mean_utility, alternative='two-sided')[1],))









    



Two-sample Kolmogorov-Smirnov test:  0.000119450412226
Two-sample unpaired t-test:  0.121620730324
Two-sample Mann Whitney U test:  0.00814208803127

Looking at the top contributors



In [32]:

    
# Mean answer utility of top contributors, missing values removed.
top_females_mean_utility = top_females.loc[:, 'mean_utility'].dropna()
top_males_mean_utility = top_males.loc[:, 'mean_utility'].dropna()

The data's shape



In [33]:

    
show_data_shape(top_females_mean_utility, top_males_mean_utility, "expon", 30, "Mean Utility Answers")









    












    












    












    












    



Levene's test:  0.161764679201
Skewness for Females:  1.49352955153
Skewness for Males:  2.20029630132

Hypothesis test



In [34]:

    
# p-values of three two-sample tests on mean answer utility (top women vs. top men).
# Each line prints one pre-formatted string, so it runs on Python 2 and 3
# and keeps the Python 2 output unchanged.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(top_females_mean_utility, top_males_mean_utility)[1],))
# Default equal_var=True: standard (pooled-variance) t-test.
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(top_females_mean_utility, top_males_mean_utility)[1],))
# Request the two-sided p-value explicitly rather than doubling the default:
# doubling is only right while the default is one-sided (SciPy >= 0.17 needed).
print("Two-sample Mann Whitney U test:  %s" % (stats.mannwhitneyu(top_females_mean_utility, top_males_mean_utility, alternative='two-sided')[1],))









    



Two-sample Kolmogorov-Smirnov test:  0.000473784166564
Two-sample unpaired t-test:  0.0632994603903
Two-sample Mann Whitney U test:  0.0378178001369

Looking at the common contributors



In [35]:

    
# Mean answer utility of common contributors, missing values removed.
common_females_mean_utility = common_females.loc[:, 'mean_utility'].dropna()
common_males_mean_utility = common_males.loc[:, 'mean_utility'].dropna()

The data's shape



In [36]:

    
show_data_shape(common_females_mean_utility, common_males_mean_utility, "expon", 30, "Mean Utility Answers")









    












    












    












    












    



Levene's test:  0.000428144054351
Skewness for Females:  0.751442870121
Skewness for Males:  0.805717651822

Hypothesis test



In [37]:

    
# p-values of three two-sample tests on mean answer utility (common women vs. common men).
# Each line prints one pre-formatted string, so it runs on Python 2 and 3
# and keeps the Python 2 output unchanged.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(common_females_mean_utility, common_males_mean_utility)[1],))
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(common_females_mean_utility, common_males_mean_utility, equal_var=False)[1],))
# Request the two-sided p-value explicitly rather than doubling the default:
# doubling is only right while the default is one-sided (SciPy >= 0.17 needed).
print("Two-sample Mann Whitney U test:  %s" % (stats.mannwhitneyu(common_females_mean_utility, common_males_mean_utility, alternative='two-sided')[1],))









    



Two-sample Kolmogorov-Smirnov test:  0.00148400371667
Two-sample unpaired t-test:  0.286649052496
Two-sample Mann Whitney U test:  0.043008849874

Hypothesis 3: The mean of the scores of the questions made by each user doesn't change between genders.

H0: questionScoreMean(Males) = questionScoreMean(Females);

H1: questionScoreMean(Males) != questionScoreMean(Females).

Data

It doesn't make sense to assess the quality of something that hasn't been done, so we exclude users who didn't post any questions.



In [38]:

    
# Mean question score of users who asked at least one question.
females_questions_mean = females.loc[females['questions_total'] > 0, 'questions_avg']
males_questions_mean = males.loc[males['questions_total'] > 0, 'questions_avg']

The shape of the data



In [39]:

    
show_data_shape(females_questions_mean, males_questions_mean, "expon", 30, "Mean Score Questions")









    












    












    












    












    



Levene's test:  0.0103241068597
Skewness for Females:  14.2790070694
Skewness for Males:  54.6242657409

Hypothesis test



In [40]:

    
# p-values of three two-sample tests on mean question score (women vs. men).
# Each line prints one pre-formatted string, so it runs on Python 2 and 3
# and keeps the Python 2 output unchanged.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(females_questions_mean, males_questions_mean)[1],))
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(females_questions_mean, males_questions_mean, equal_var=False)[1],))
# Request the two-sided p-value explicitly rather than doubling the default:
# doubling is only right while the default is one-sided (SciPy >= 0.17 needed).
print("Two-sample Mann Whitney U test:  %s" % (stats.mannwhitneyu(females_questions_mean, males_questions_mean, alternative='two-sided')[1],))









    



Two-sample Kolmogorov-Smirnov test:  3.44842321018e-36
Two-sample unpaired t-test:  4.22941300745e-07
Two-sample Mann Whitney U test:  1.5131558872e-36

Looking at the top contributors



In [41]:

    
# Mean question score of top contributors who asked at least one question.
top_females_questions_mean = top_females.loc[top_females['questions_total'] > 0, 'questions_avg']
top_males_questions_mean = top_males.loc[top_males['questions_total'] > 0, 'questions_avg']

The data's shape



In [42]:

    
show_data_shape(top_females_questions_mean, top_males_questions_mean, "expon", 30, "Mean Score Questions")









    












    












    












    












    



Levene's test:  0.344968587838
Skewness for Females:  10.3789676902
Skewness for Males:  33.6598705553

Hypothesis test



In [43]:

    
# p-values of three two-sample tests on mean question score (top women vs. top men).
# Each line prints one pre-formatted string, so it runs on Python 2 and 3
# and keeps the Python 2 output unchanged.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(top_females_questions_mean, top_males_questions_mean)[1],))
# Default equal_var=True: standard (pooled-variance) t-test.
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(top_females_questions_mean, top_males_questions_mean)[1],))
# Request the two-sided p-value explicitly rather than doubling the default:
# doubling is only right while the default is one-sided (SciPy >= 0.17 needed).
print("Two-sample Mann Whitney U test:  %s" % (stats.mannwhitneyu(top_females_questions_mean, top_males_questions_mean, alternative='two-sided')[1],))









    



Two-sample Kolmogorov-Smirnov test:  1.37202186703e-07
Two-sample unpaired t-test:  0.208913893049
Two-sample Mann Whitney U test:  1.56635411444e-07

Looking at the common contributors



In [44]:

    
# Mean question score of common contributors who asked at least one question.
common_females_questions_mean = common_females.loc[common_females['questions_total'] > 0, 'questions_avg']
common_males_questions_mean = common_males.loc[common_males['questions_total'] > 0, 'questions_avg']

The data's shape



In [45]:

    
show_data_shape(common_females_questions_mean, common_males_questions_mean, "expon", 30, "Mean Score Questions")









    












    












    












    












    



Levene's test:  0.28003070781
Skewness for Females:  14.5549635989
Skewness for Males:  13.2888573576

Hypothesis test



In [46]:

    
# p-values of three two-sample tests on mean question score (common women vs. common men).
# Each line prints one pre-formatted string, so it runs on Python 2 and 3
# and keeps the Python 2 output unchanged.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(common_females_questions_mean, common_males_questions_mean)[1],))
# Default equal_var=True: standard (pooled-variance) t-test.
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(common_females_questions_mean, common_males_questions_mean)[1],))
# Request the two-sided p-value explicitly rather than doubling the default:
# doubling is only right while the default is one-sided (SciPy >= 0.17 needed).
print("Two-sample Mann Whitney U test:  %s" % (stats.mannwhitneyu(common_females_questions_mean, common_males_questions_mean, alternative='two-sided')[1],))









    



Two-sample Kolmogorov-Smirnov test:  3.68236021391e-17
Two-sample unpaired t-test:  0.0604102276128
Two-sample Mann Whitney U test:  2.43897237729e-14

Hypothesis 4: Reputation is the same between genders.

H0: reputation(Males) = reputation(Females);

H1: reputation(Males) != reputation(Females).



In [47]:

    
# Reputation column of every user in each group (no filtering applied).
females_reputation = females.loc[:, 'reputation']
males_reputation = males.loc[:, 'reputation']



In [48]:

    
show_data_shape(females_reputation, males_reputation, "expon", 50, "Reputation")









    












    












    












    












    



Levene's test:  5.7427856712e-10
Skewness for Females:  20.5032200007
Skewness for Males:  31.7655218713

Hypothesis test



In [49]:

    
# p-values of three two-sample tests on reputation (women vs. men).
# Each line prints one pre-formatted string, so it runs on Python 2 and 3
# and keeps the Python 2 output unchanged.
print("Two-sample Kolmogorov-Smirnov test:  %s" % (stats.ks_2samp(females_reputation, males_reputation)[1],))
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print("Two-sample unpaired t-test:  %s" % (stats.ttest_ind(females_reputation, males_reputation, equal_var=False)[1],))
# Request the two-sided p-value explicitly rather than doubling the default:
# doubling is only right while the default is one-sided (SciPy >= 0.17 needed).
print("Two-sample Mann Whitney U test:  %s" % (stats.mannwhitneyu(females_reputation, males_reputation, alternative='two-sided')[1],))









    



Two-sample Kolmogorov-Smirnov test:  3.78635601452e-61
Two-sample unpaired t-test:  2.73694656006e-27
Two-sample Mann Whitney U test:  8.98048646755e-81

Initialization

Here you can find the data import and some helper functions used to analyse the data. Please run this section first; otherwise the analysis will not work.

Importing the data from the MongoDB database and loading it into a pandas DataFrame for easy manipulation.



In [1]:

    
from __future__ import division
import pymongo, time, pylab, numpy, pandas
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot

%matplotlib inline

# Connect to the local MongoDB server and pick the per-community statistics collection.
client = pymongo.MongoClient('localhost', 27017)

community = 'stackoverflow'
stats_db = client[community].statistics

# Keep only users with at least one question, answer or comment.
active_users_filter = {
    '$or': [
        {'questions_total': {'$gt': 0}},
        {'answers_total': {'$gt': 0}},
        {'comments_total': {'$gt': 0}},
    ]
}

# Fields returned for each user; Mongo's _id is explicitly excluded.
user_fields = {
    u'_id': False,
    u'accepted_rate': True,
    u'reputation': True,
    u'questions_avg': True,
    u'answers_total': True,
    u'gender': True,
    u'questions_total': True,
    u'answers_accepted_total': True,
    u'mean_utility': True,
}

cursor = stats_db.find(active_users_filter, user_fields)

# Materialise the whole result set into a single DataFrame.
df = pandas.DataFrame(list(cursor))

# One frame per gender label; rows with any other gender value are left out of both.
males = df.loc[df['gender'] == 'Male']
females = df.loc[df['gender'] == 'Female']

Utility functions for plotting.



In [2]:

    
# Restore matplotlib's default rc settings, then apply the 'ggplot' style
# sheet so every figure drawn below shares the same look.
pyplot.rcdefaults()
mpl.style.use('ggplot')

def histogram(sample1, sample2, bins, aspect):
    """Draw two histograms side by side on a new 15x7 figure.

    sample1 -- values shown in the left panel (titled "... by Females").
    sample2 -- values shown in the right panel (titled "... by Males").
    bins    -- forwarded as the bins argument of Axes.hist.
    aspect  -- text placed at the start of each panel title.

    The figure is created but not shown; nothing is returned.
    """
    _, panels = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    titled_samples = (
        (sample1, " by Females - Histogram"),
        (sample2, " by Males - Histogram"),
    )
    for panel, (values, suffix) in zip(panels, titled_samples):
        panel.hist(list(values), bins)
        panel.set_title(aspect + suffix)

def pdf_plot(sample1, sample2, aspect):
    """Draw density plots of two samples side by side on a new 15x7 figure.

    sample1 -- plotted in the left panel (titled "... by Females").
    sample2 -- plotted in the right panel (titled "... by Males").
    aspect  -- text placed at the start of each panel title.

    Both samples must offer a ``.plot(ax=..., kind="density")`` method, as
    pandas Series do. The figure is created but not shown; nothing is returned.
    """
    _, (left, right) = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))

    panels = (
        (left, sample1, " by Females - Density"),
        (right, sample2, " by Males - Density"),
    )
    for panel, sample, suffix in panels:
        # The title goes on whatever axes object .plot() hands back.
        drawn = sample.plot(ax=panel, kind="density")
        drawn.set_title(aspect + suffix)

    
    
def boxplot(sample1, sample2, aspect):
    """Draw two boxplots side by side on a new 15x7 figure.

    sample1 -- values shown in the left panel (titled "... by Females").
    sample2 -- values shown in the right panel (titled "... by Males").
    aspect  -- text placed at the start of each panel title.

    The figure is created but not shown; nothing is returned.
    """
    _, panels = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 7))
    titled_samples = (
        (sample1, " by Females - Boxplot"),
        (sample2, " by Males - Boxplot"),
    )
    for panel, (values, suffix) in zip(panels, titled_samples):
        panel.boxplot(list(values))
        panel.set_title(aspect + suffix)
    

def qq_plot(sample1, sample2, distribution, aspect):
    """Draw probability (QQ) plots of two samples side by side on a new 15x7 figure.

    sample1      -- values for the left panel (titled "... by Females").
    sample2      -- values for the right panel (titled "... by Males").
    distribution -- name of the theoretical distribution, forwarded as ``dist``
                    to scipy.stats.probplot and appended to the panel titles
                    (so it must be a string, e.g. "norm" or "expon").
    aspect       -- text placed at the start of each panel title.

    The figure is created but not shown; nothing is returned.
    """
    fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))

    # Draw directly onto the axes created above. Going through
    # pyplot.subplot(121/122) could create or replace axes, so the titles set
    # below might land on axes that are no longer part of the figure.
    stats.probplot(list(sample1), dist=distribution, plot=axes[0])
    # Set after probplot so our title replaces the one probplot writes.
    axes[0].set_title(aspect + " by Females - QQPlot "+ distribution)

    stats.probplot(list(sample2), dist=distribution, plot=axes[1])
    axes[1].set_title(aspect + " by Males - QQPlot "+ distribution)

Utility functions for describing the data.



In [3]:

    
def describe(sample1, sample2):
    """Print summary statistics and the median of two samples.

    For each sample, in order, prints the result of ``.describe()`` followed
    by a "Median:" line with ``.median()``; a blank line separates the two
    samples. Both arguments must provide ``describe()`` and ``median()``
    (e.g. pandas Series). Nothing is returned.
    """
    # Every print receives one pre-formatted string, so the calls behave the
    # same as a Python 2 print statement and as the Python 3 print function.
    print(sample1.describe())
    print("Median:  %s" % (sample1.median(),))
    print("")
    print(sample2.describe())
    print("Median:  %s" % (sample2.median(),))
    
def show_data_shape(sample1, sample2, dist, bins, aspect):
    """Plot and summarise the shape of two samples (females first, males second).

    Closes all open figures, then draws histograms, density plots, QQ plots
    and boxplots of both samples, shows them, and finally prints the p-value
    of Levene's test and the skewness of each sample.

    sample1 -- sample labelled "Females" in plots and printed output.
    sample2 -- sample labelled "Males" in plots and printed output.
    dist    -- distribution name forwarded to qq_plot.
    bins    -- bins argument forwarded to histogram.
    aspect  -- text used at the start of every plot title.

    Nothing is returned.
    """
    pyplot.close('all')

    # One figure per view, in the order they are displayed.
    histogram(sample1, sample2, bins, aspect)
    pdf_plot(sample1, sample2, aspect)
    qq_plot(sample1, sample2, dist, aspect)
    boxplot(sample1, sample2, aspect)
    pyplot.show()

    # Every print receives one pre-formatted string, so the calls behave the
    # same as a Python 2 print statement and as the Python 3 print function.
    # Index [1] of the Levene result is its p-value.
    print("Levene's test:  %s" % (stats.levene(sample1, sample2)[1],))

    print("Skewness for Females:  %s" % (stats.skew(sample1),))
    print("Skewness for Males:  %s" % (stats.skew(sample2),))



In [ ]:

	accepted_rate	answers_accepted_total	answers_total	mean_utility	questions_avg	questions_total	reputation
count	7897.000000	7897.000000	7897.000000	4161.000000	7897.000000	7897.000000	7897.000000
mean	0.270054	7.582246	22.341775	-0.021649	2.056335	13.543624	694.373939
std	0.300061	43.464103	102.089690	0.781284	5.913016	31.424080	3526.726330
min	0.000000	0.000000	0.000000	-1.827701	-4.000000	0.000000	50.000000
25%	0.000000	0.000000	1.000000	-0.607915	0.058824	1.000000	75.000000
50%	0.200000	1.000000	4.000000	-0.160692	0.909091	5.000000	132.000000
75%	0.428571	4.000000	13.000000	0.457107	2.000000	14.000000	394.000000
max	1.000000	1445.000000	3249.000000	3.697290	196.000000	728.000000	141184.000000

	accepted_rate	answers_accepted_total	answers_total	mean_utility	questions_avg	questions_total	reputation
count	125340.000000	125340.000000	125340.000000	67917.000000	125340.000000	125340.000000	125340.000000
mean	0.281233	12.361401	34.937674	-0.002369	2.263167	11.434690	1169.622946
std	0.271936	94.287404	195.043927	0.754368	8.969468	27.684822	6567.785536
min	0.000000	0.000000	0.000000	-2.236068	-6.000000	0.000000	50.000000
25%	0.000000	0.000000	2.000000	-0.548726	0.000000	1.000000	86.000000
50%	0.250000	2.000000	6.000000	-0.130009	1.000000	4.000000	176.000000
75%	0.421053	6.000000	20.000000	0.404377	2.277778	12.000000	606.000000
max	1.000000	16730.000000	28137.000000	5.313012	1300.000000	1327.000000	640237.000000

	accepted_rate	answers_accepted_total	answers_total	mean_utility	questions_avg	questions_total	reputation
count	868.000000	868.000000	868.000000	181.000000	868.000000	868.000000	868.000000
mean	0.342600	52.654378	144.726959	0.070394	3.903852	38.675115	4651.165899
std	0.157587	121.787467	277.750081	0.619072	10.459406	77.020379	9763.712921
min	0.000000	0.000000	0.000000	-1.000000	-2.500000	0.000000	1002.000000
25%	0.244309	10.000000	32.000000	-0.373423	0.944153	3.000000	1330.750000
50%	0.333333	22.000000	70.000000	0.031842	1.988506	12.500000	2027.500000
75%	0.437363	46.000000	133.250000	0.383406	3.671875	38.000000	3837.250000
max	1.000000	1445.000000	3249.000000	3.107033	173.437500	728.000000	141184.000000

	accepted_rate	answers_accepted_total	answers_total	mean_utility	questions_avg	questions_total	reputation
count	21572.000000	21572.000000	21572.000000	3925.000000	21572.000000	21572.000000	21572.000000
mean	0.346288	60.458743	161.496755	0.157097	4.593401	29.144029	5655.511218
std	0.148891	220.917545	448.405582	0.613714	19.086542	56.172743	15036.391721
min	0.000000	0.000000	0.000000	-1.582612	-5.000000	0.000000	1001.000000
25%	0.250000	11.000000	38.000000	-0.198330	1.000000	3.000000	1425.000000
50%	0.333333	24.000000	73.000000	0.067474	2.246664	10.000000	2272.500000
75%	0.432692	50.000000	146.000000	0.362103	4.375000	31.000000	4612.250000
max	1.000000	16730.000000	28137.000000	5.313012	1300.000000	1327.000000	640237.000000

	accepted_rate	answers_accepted_total	answers_total	mean_utility	questions_avg	questions_total	reputation
count	7029.000000	7029.000000	7029.000000	3980.000000	7029.000000	7029.000000	7029.000000
mean	0.261095	2.016361	7.228624	-0.025835	1.828188	10.440176	205.756011
std	0.312030	3.287563	10.690553	0.787675	5.031152	17.028885	203.109675
min	0.000000	0.000000	0.000000	-1.827701	-4.000000	0.000000	50.000000
25%	0.000000	0.000000	1.000000	-0.625221	0.000000	1.000000	71.000000
50%	0.166667	1.000000	3.000000	-0.170780	0.800000	5.000000	116.000000
75%	0.428571	3.000000	9.000000	0.463401	1.800000	13.000000	271.000000
max	1.000000	55.000000	109.000000	3.697290	196.000000	205.000000	999.000000

	accepted_rate	answers_accepted_total	answers_total	mean_utility	questions_avg	questions_total	reputation
count	103768.000000	103768.000000	103768.000000	63992.000000	103768.000000	103768.000000	103768.000000
mean	0.267709	2.362597	8.627708	-0.012149	1.778742	7.753151	237.065878
std	0.289226	3.452494	11.440576	0.761068	4.481431	13.823759	223.953725
min	0.000000	0.000000	0.000000	-2.236068	-6.000000	0.000000	50.000000
25%	0.000000	0.000000	1.000000	-0.573193	0.000000	0.000000	77.000000
50%	0.200000	1.000000	5.000000	-0.149071	0.789474	3.000000	133.000000
75%	0.411765	3.000000	11.000000	0.408880	2.000000	9.000000	340.000000
max	1.000000	48.000000	157.000000	5.053353	325.000000	519.000000	1000.000000