Community: Programmers

Inicialization and importing data are at the end of this notebook. For better visualization of the analysis they were placed at the bottom, but it's necessary to run them first so the analysis work as expected. Click here to go there.

Data Segregation

All contributors



In [17]:

    
males = df[df['gender']=='Male']
females = df[df['gender']=='Female']
all_users = df.copy()



In [18]:

    
print females.median()
females.describe()









    



activity_freq      1.200000
days_active        2.000000
gender_cat         0.000000
lifetime           4.721997
max_interval       0.000000
reputation       127.000000
dead               1.000000
dtype: float64






    Out[18]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       383.000000
       383.000000
       383
        383.000000
       383.000000
         383.000000
             383
    
    
      mean
         1.629292
        10.091384
         0
        198.649195
        66.091384
         472.519582
       0.8929504
    
    
      std
         1.055821
        34.690245
         0
        316.532569
       133.999979
        1833.622104
       0.3095807
    
    
      min
         1.000000
         1.000000
         0
          0.000000
         0.000000
          51.000000
           False
    
    
      25%
         1.000000
         1.000000
         0
          0.000000
         0.000000
         101.000000
               1
    
    
      50%
         1.200000
         2.000000
         0
          4.721997
         0.000000
         127.000000
               1
    
    
      75%
         2.000000
         5.000000
         0
        302.721478
        71.500000
         213.500000
               1
    
    
      max
         9.000000
       370.000000
         0
       1218.029161
       776.000000
       24391.000000
            True



In [19]:

    
print males.median()
males.describe()









    



activity_freq      1.285714
days_active        2.000000
gender_cat         1.000000
lifetime          12.440703
max_interval       0.000000
reputation       133.000000
dead               1.000000
dtype: float64






    Out[19]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       8734.000000
       8734.000000
       8734
       8734.000000
       8734.000000
        8734.000000
            8734
    
    
      mean
          1.637862
          8.804900
          1
        209.364255
         75.435081
         421.200939
       0.8970689
    
    
      std
          1.004615
         31.260074
          0
        316.084008
        149.071970
        1759.699954
       0.3038862
    
    
      min
          1.000000
          1.000000
          1
          0.000000
          0.000000
          50.000000
           False
    
    
      25%
          1.000000
          1.000000
          1
          0.000000
          0.000000
         103.000000
               1
    
    
      50%
          1.285714
          2.000000
          1
         12.440703
          0.000000
         133.000000
               1
    
    
      75%
          2.000000
          5.000000
          1
        328.129638
         88.000000
         239.000000
               1
    
    
      max
         18.000000
        755.000000
          1
       1233.767837
       1161.000000
       62314.000000
            True



In [20]:

    
print all_users.median()
all_users.describe()









    



activity_freq      1.280000
days_active        2.000000
gender_cat         1.000000
lifetime          11.966711
max_interval       0.000000
reputation       133.000000
dead               1.000000
dtype: float64






    Out[20]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       9117.000000
       9117.000000
       9117.000000
       9117.000000
       9117.000000
        9117.000000
            9117
    
    
      mean
          1.637502
          8.858945
          0.957991
        208.914122
         75.042558
         423.356806
       0.8968959
    
    
      std
          1.006759
         31.410687
          0.200622
        316.092791
        148.474730
        1762.793560
       0.3041115
    
    
      min
          1.000000
          1.000000
          0.000000
          0.000000
          0.000000
          50.000000
           False
    
    
      25%
          1.000000
          1.000000
          1.000000
          0.000000
          0.000000
         103.000000
               1
    
    
      50%
          1.280000
          2.000000
          1.000000
         11.966711
          0.000000
         133.000000
               1
    
    
      75%
          2.000000
          5.000000
          1.000000
        326.909254
         88.000000
         239.000000
               1
    
    
      max
         18.000000
        755.000000
          1.000000
       1233.767837
       1161.000000
       62314.000000
            True

Top contributors



In [21]:

    
top_females = females[females["reputation"]> 450]
top_males = males[males["reputation"]> 450]

top_users = all_users[all_users["reputation"]> 450]



In [22]:

    
print top_females.median()
top_females.describe()









    



activity_freq       1.890000
days_active        32.000000
gender_cat          0.000000
lifetime          721.040561
max_interval      136.000000
reputation       1223.000000
dead                1.000000
dtype: float64






    Out[22]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       46.000000
        46.000000
       46
         46.000000
        46.000000
          46.000000
              46
    
    
      mean
        1.933474
        63.239130
        0
        679.853150
       151.043478
        2842.021739
       0.7391304
    
    
      std
        0.624939
        82.829191
        0
        383.046066
       120.207036
        4687.361966
       0.4439611
    
    
      min
        1.000000
         1.000000
        0
          0.313784
         0.000000
         454.000000
           False
    
    
      25%
        1.438034
        16.250000
        0
        331.432239
        48.500000
         685.000000
            0.25
    
    
      50%
        1.890000
        32.000000
        0
        721.040561
       136.000000
        1223.000000
               1
    
    
      75%
        2.360197
        70.500000
        0
        995.340364
       204.250000
        2505.250000
               1
    
    
      max
        3.437500
       370.000000
        0
       1218.029161
       500.000000
       24391.000000
            True



In [23]:

    
print top_males.median()
top_males.describe()









    



activity_freq      1.692308
days_active       21.000000
gender_cat         1.000000
lifetime         620.107217
max_interval     119.000000
reputation       910.000000
dead               1.000000
dtype: float64






    Out[23]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       1113.000000
       1113.000000
       1113
       1113.000000
       1113.000000
        1113.000000
            1113
    
    
      mean
          1.932771
         47.292004
          1
        596.251071
        158.495957
        2234.580413
       0.7610063
    
    
      std
          0.868659
         76.434107
          0
        376.621199
        151.640158
        4528.080814
       0.4266606
    
    
      min
          1.000000
          1.000000
          1
          0.000000
          0.000000
         451.000000
           False
    
    
      25%
          1.400000
          9.000000
          1
        263.820522
         47.000000
         606.000000
               1
    
    
      50%
          1.692308
         21.000000
          1
        620.107217
        119.000000
         910.000000
               1
    
    
      75%
          2.190476
         47.000000
          1
        920.162689
        215.000000
        1815.000000
               1
    
    
      max
          7.800000
        755.000000
          1
       1233.767837
        940.000000
       62314.000000
            True



In [24]:

    
print top_users.median()
top_users.describe()









    



activity_freq      1.695652
days_active       22.000000
gender_cat         1.000000
lifetime         625.687289
max_interval     120.000000
reputation       912.000000
dead               1.000000
dtype: float64






    Out[24]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       1159.000000
       1159.000000
       1159.000000
       1159.000000
       1159.000000
        1159.000000
            1159
    
    
      mean
          1.932799
         47.924935
          0.960311
        599.569186
        158.200173
        2258.689387
       0.7601381
    
    
      std
          0.860099
         76.722930
          0.195313
        377.064087
        150.482355
        4533.974662
       0.4271834
    
    
      min
          1.000000
          1.000000
          0.000000
          0.000000
          0.000000
         451.000000
           False
    
    
      25%
          1.400000
          9.000000
          1.000000
        268.974931
         47.000000
         606.500000
               1
    
    
      50%
          1.695652
         22.000000
          1.000000
        625.687289
        120.000000
         912.000000
               1
    
    
      75%
          2.200000
         48.000000
          1.000000
        926.605435
        214.500000
        1840.500000
               1
    
    
      max
          7.800000
        755.000000
          1.000000
       1233.767837
        940.000000
       62314.000000
            True

Common women contributors



In [25]:

    
common_females = females[females["reputation"] <= 450]
common_males = males[males["reputation"] <= 450]

common_users = all_users[all_users["reputation"]<= 450]



In [26]:

    
print common_females.median()
common_females.describe()









    



activity_freq      1.000000
days_active        2.000000
gender_cat         0.000000
lifetime           0.761798
max_interval       0.000000
reputation       121.000000
dead               1.000000
dtype: float64






    Out[26]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       337.000000
       337.000000
       337
        337.000000
       337.000000
       337.000000
             337
    
    
      mean
         1.587771
         2.836795
         0
        132.965569
        54.495549
       149.086053
       0.9139466
    
    
      std
         1.095752
         3.134877
         0
        241.310392
       131.742695
        79.347610
       0.2808599
    
    
      min
         1.000000
         1.000000
         0
          0.000000
         0.000000
        51.000000
           False
    
    
      25%
         1.000000
         1.000000
         0
          0.000000
         0.000000
       101.000000
               1
    
    
      50%
         1.000000
         2.000000
         0
          0.761798
         0.000000
       121.000000
               1
    
    
      75%
         2.000000
         3.000000
         0
        160.812156
        16.000000
       159.000000
               1
    
    
      max
         9.000000
        21.000000
         0
       1113.368895
       776.000000
       450.000000
            True



In [27]:

    
print common_females.median()
common_females.describe()









    



activity_freq      1.000000
days_active        2.000000
gender_cat         0.000000
lifetime           0.761798
max_interval       0.000000
reputation       121.000000
dead               1.000000
dtype: float64






    Out[27]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       337.000000
       337.000000
       337
        337.000000
       337.000000
       337.000000
             337
    
    
      mean
         1.587771
         2.836795
         0
        132.965569
        54.495549
       149.086053
       0.9139466
    
    
      std
         1.095752
         3.134877
         0
        241.310392
       131.742695
        79.347610
       0.2808599
    
    
      min
         1.000000
         1.000000
         0
          0.000000
         0.000000
        51.000000
           False
    
    
      25%
         1.000000
         1.000000
         0
          0.000000
         0.000000
       101.000000
               1
    
    
      50%
         1.000000
         2.000000
         0
          0.761798
         0.000000
       121.000000
               1
    
    
      75%
         2.000000
         3.000000
         0
        160.812156
        16.000000
       159.000000
               1
    
    
      max
         9.000000
        21.000000
         0
       1113.368895
       776.000000
       450.000000
            True



In [28]:

    
print common_users.median()
common_users.describe()









    



activity_freq      1.105556
days_active        2.000000
gender_cat         1.000000
lifetime           1.470971
max_interval       0.000000
reputation       123.000000
dead               1.000000
dtype: float64






    Out[28]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       7958.000000
       7958.000000
       7958.000000
       7958.000000
       7958.000000
       7958.000000
            7958
    
    
      mean
          1.594495
          3.169389
          0.957653
        152.019271
         62.931515
        156.059688
       0.9168133
    
    
      std
          1.019307
          4.359889
          0.201393
        261.362091
        144.243616
         79.193383
       0.2761816
    
    
      min
          1.000000
          1.000000
          0.000000
          0.000000
          0.000000
         50.000000
           False
    
    
      25%
          1.000000
          1.000000
          1.000000
          0.000000
          0.000000
        101.000000
               1
    
    
      50%
          1.105556
          2.000000
          1.000000
          1.470971
          0.000000
        123.000000
               1
    
    
      75%
          2.000000
          3.000000
          1.000000
        203.931563
         39.000000
        177.750000
               1
    
    
      max
         18.000000
         87.000000
          1.000000
       1221.605458
       1161.000000
        450.000000
            True

Third Question: Do women and men contribute for the same amount of time ?

Hypothesis 1: The survival time of users is the same among genders.

H0: survivalTime(Males) = survivalTime(Females);

H1: survivalTime(Males) != survivalTime(Females).

Data



In [29]:

    
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()

T = all_users["lifetime"] #measure in days
C = all_users["dead"]
T.describe()

females_ = all_users["gender"] == "Female"
males_ = all_users["gender"] == "Male"



In [30]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  9.91538854167
Median:  22.2883001505






    Out[30]:





<matplotlib.text.Text at 0x115d50550>

Hypothesis test



In [31]:

    
from lifelines.statistics import logrank_test

summary, p_value, test_results = logrank_test(T[females_], T[males_], C[females_], C[males_], alpha=.95 )









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.61520 |              0.253 |     None

Looking at the top contributors



In [32]:

    
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()

T = top_users["lifetime"] #measure in days
C = top_users["dead"]

females_ = top_users["gender"] == "Female"
males_ = top_users["gender"] == "Male"



In [33]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  750.813918102
Median:  658.993864502






    Out[33]:





<matplotlib.text.Text at 0x11785c610>

Hypothesis test



In [34]:

    
from lifelines.statistics import logrank_test

summary, p_value, test_results = logrank_test(T[females_], T[males_], C[females_], C[males_], alpha=.95 )









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.27086 |              1.212 |     None

Looking at the common contributors



In [35]:

    
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()

T = common_users["lifetime"] #measure in days
C = common_users["dead"]

females_ = common_users["gender"] == "Female"
males_ = common_users["gender"] == "Male"



In [36]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  1.22552979167
Median:  3.11974934028






    Out[36]:





<matplotlib.text.Text at 0x11652df90>

Hypotesis test



In [37]:

    
from lifelines.statistics import logrank_test

summary, p_value, test_results = logrank_test(T[females_], T[males_], C[females_], C[males_], alpha=.95 )









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.22567 |              1.468 |     None

Hypothesis 2: The amount of activity days is the same between genders.

H0: daysActive(Males) = daysActive(Females);

H1: daysActive(Males) != daysActive(Females).

Correlation and Binomial Negative Regression



In [22]:

    
all_users.corr(method="spearman")









    Out[22]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.380506
       0.006349
       0.430490
       0.374827
       0.350092
      -0.102759
    
    
      days_active
       0.380506
       1.000000
       0.009307
       0.886377
       0.828051
       0.656185
      -0.211735
    
    
      gender_cat
       0.006349
       0.009307
       1.000000
       0.010895
       0.017022
       0.019255
       0.003507
    
    
      lifetime
       0.430490
       0.886377
       0.010895
       1.000000
       0.808751
       0.579643
      -0.239589
    
    
      max_interval
       0.374827
       0.828051
       0.017022
       0.808751
       1.000000
       0.542324
      -0.200893
    
    
      reputation
       0.350092
       0.656185
       0.019255
       0.579643
       0.542324
       1.000000
      -0.105239
    
    
      dead
      -0.102759
      -0.211735
       0.003507
      -0.239589
      -0.200893
      -0.105239
       1.000000



In [23]:

    
exog = all_users["gender_cat"]
exog = sm.add_constant(exog, prepend=True)
endog = all_users["days_active"]

mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())

res_nbin = mod_nbin.fit()

res_nbin.summary()









    Out[23]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:        9117   


  Model:                  GLM          Df Residuals:            9115   


  Model Family:    NegativeBinomial    Df Model:                   1   


  Link Function:          log          Scale:              11.295333667


  Method:                IRLS          Log-Likelihood:        -29498.  


  Date:            Sun, 05 Oct 2014    Deviance:               17829.  


  Time:                22:17:03        Pearson chi2:         1.03e+05  


  No. Iterations:          8                                           




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           2.3117      0.180     12.840   0.000      1.959     2.665


  gender_cat     -0.1364      0.184     -0.741   0.459     -0.497     0.224

Data's shape



In [24]:

    
nobs = res_nbin.nobs

y = endog / endog.sum(1)

yhat = res_nbin.mu

pyplot.scatter(yhat, y)

res_nbin.params









    Out[24]:





const         2.311682
gender_cat   -0.136374
dtype: float64

Looking at the top contributors



In [25]:

    
top_users.corr(method="spearman")









    Out[25]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.315158
      -0.028147
       0.100859
      -0.001372
       0.339114
      -0.110973
    
    
      days_active
       0.315158
       1.000000
      -0.071390
       0.644933
       0.191464
       0.733031
      -0.463801
    
    
      gender_cat
      -0.028147
      -0.071390
       1.000000
      -0.044740
      -0.009697
      -0.044905
       0.010207
    
    
      lifetime
       0.100859
       0.644933
      -0.044740
       1.000000
       0.569067
       0.452598
      -0.517108
    
    
      max_interval
      -0.001372
       0.191464
      -0.009697
       0.569067
       1.000000
       0.068937
      -0.110230
    
    
      reputation
       0.339114
       0.733031
      -0.044905
       0.452598
       0.068937
       1.000000
      -0.305872
    
    
      dead
      -0.110973
      -0.463801
       0.010207
      -0.517108
      -0.110230
      -0.305872
       1.000000



In [26]:

    
exog = top_users["gender_cat"]
exog = sm.add_constant(exog, prepend=True)
endog = top_users["days_active"]

mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())

res_nbin = mod_nbin.fit()

res_nbin.summary()









    Out[26]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:        1159    


  Model:                  GLM          Df Residuals:            1157    


  Model Family:    NegativeBinomial    Df Model:                   1    


  Link Function:          log          Scale:              2.52425586707


  Method:                IRLS          Log-Likelihood:        -5653.9   


  Date:            Sun, 05 Oct 2014    Deviance:               1753.7   


  Time:                22:20:01        Pearson chi2:         2.92e+03   


  No. Iterations:          7                                            




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           4.1469      0.236     17.564   0.000      3.684     4.610


  gender_cat     -0.2906      0.241     -1.206   0.228     -0.763     0.182

Data's shape



In [27]:

    
nobs = res_nbin.nobs

y = endog / endog.sum(1)

yhat = res_nbin.mu

pyplot.scatter(yhat, y)

res_nbin.params









    Out[27]:





const         4.146923
gender_cat   -0.290582
dtype: float64

Looking at the common contributors



In [28]:

    
common_users.corr(method="spearman")









    Out[28]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.320997
       0.008963
       0.415882
       0.344223
       0.280651
      -0.062252
    
    
      days_active
       0.320997
       1.000000
       0.012734
       0.877658
       0.814611
       0.515511
      -0.136177
    
    
      gender_cat
       0.008963
       0.012734
       1.000000
       0.014673
       0.019978
       0.022805
       0.003135
    
    
      lifetime
       0.415882
       0.877658
       0.014673
       1.000000
       0.767169
       0.470158
      -0.159096
    
    
      max_interval
       0.344223
       0.814611
       0.019978
       0.767169
       1.000000
       0.438500
      -0.155000
    
    
      reputation
       0.280651
       0.515511
       0.022805
       0.470158
       0.438500
       1.000000
      -0.004011
    
    
      dead
      -0.062252
      -0.136177
       0.003135
      -0.159096
      -0.155000
      -0.004011
       1.000000



In [29]:

    
exog = common_users["gender_cat"]
exog = sm.add_constant(exog, prepend=True)
endog = common_users["days_active"]

mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())

res_nbin = mod_nbin.fit()

res_nbin.summary()









    Out[29]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:        7958    


  Model:                  GLM          Df Residuals:            7956    


  Model Family:    NegativeBinomial    Df Model:                   1    


  Link Function:          log          Scale:              1.43359042321


  Method:                IRLS          Log-Likelihood:        -18277.   


  Date:            Sun, 05 Oct 2014    Deviance:               4941.1   


  Time:                22:20:29        Pearson chi2:         1.14e+04   


  No. Iterations:          6                                            




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           1.0427      0.076     13.746   0.000      0.894     1.191


  gender_cat      0.1155      0.077      1.491   0.136     -0.036     0.267

Data's shape



In [30]:

    
nobs = res_nbin.nobs

y = endog / endog.sum(1)

yhat = res_nbin.mu

pyplot.scatter(yhat, y)

res_nbin.params









    Out[30]:





const         1.042675
gender_cat    0.115494
dtype: float64

Hypothesis 3: The frequency of participation is the same between genders.

H0: frequency(Males) = frequency(Females);

H1: frequency(Males) != frequency(Females).

Data



In [47]:

    
females_frequency = females['activity_freq']
males_frequency = males['activity_freq']

Data's summary



In [48]:

    
print "Female:"
print females_frequency.describe()
print "Median: ", females_frequency.median()

print 
print "Male:"
print males_frequency.describe()
print "Median: ", males_frequency.median()









    



Female:
count    383.000000
mean       1.629292
std        1.055821
min        1.000000
25%        1.000000
50%        1.200000
75%        2.000000
max        9.000000
dtype: float64
Median:  1.2

Male:
count    8734.000000
mean        1.637862
std         1.004615
min         1.000000
25%         1.000000
50%         1.285714
75%         2.000000
max        18.000000
dtype: float64
Median:  1.28571428571

The data's shape



In [49]:

    
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
females_frequency.hist(ax=axes[0])
axes[0].set_title("Post Frequency by Females - Histogram")
males_frequency.hist(ax=axes[1])
axes[1].set_title("Post Frequency by Males - Histogram")









    Out[49]:





<matplotlib.text.Text at 0x111673050>

Hypothesis test



In [51]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( females_frequency, males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_frequency, males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  0.562054040279

Looking at the top contributors



In [52]:

    
top_females_frequency = top_females['activity_freq']
top_males_frequency = top_males['activity_freq']

Data's summary



In [53]:

    
print "Female:"
print top_females_frequency.describe()
print "Median: ", top_females_frequency.median()

print 
print "Male:"
print top_males_frequency.describe()
print "Median: ", top_males_frequency.median()









    



Female:
count    46.000000
mean      1.933474
std       0.624939
min       1.000000
25%       1.438034
50%       1.890000
75%       2.360197
max       3.437500
dtype: float64
Median:  1.89

Male:
count    1113.000000
mean        1.932771
std         0.868659
min         1.000000
25%         1.400000
50%         1.692308
75%         2.190476
max         7.800000
dtype: float64
Median:  1.69230769231

The data's shape



In [54]:

    
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
top_females_frequency.hist(ax=axes[0])
axes[0].set_title("Post Frequency by Females - Histogram")
top_males_frequency.hist(ax=axes[1])
axes[1].set_title("Post Frequency by Males - Histogram")









    Out[54]:





<matplotlib.text.Text at 0x1118c18d0>

Hypotesis test



In [56]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( top_females_frequency, top_males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_frequency, top_males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  0.338258112935

Looking at the common contributors



In [57]:

    
common_females_frequency = common_females['activity_freq']
common_males_frequency = common_males['activity_freq']

Data's summary



In [58]:

    
print "Female:"
print common_females_frequency.describe()
print "Median: ", common_females_frequency.median()

print 
print "Male:"
print common_males_frequency.describe()
print "Median: ", common_males_frequency.median()









    



Female:
count    337.000000
mean       1.587771
std        1.095752
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        9.000000
dtype: float64
Median:  1.0

Male:
count    7621.000000
mean        1.594792
std         1.015870
min         1.000000
25%         1.000000
50%         1.111111
75%         2.000000
max        18.000000
dtype: float64
Median:  1.11111111111

The data's shape



In [59]:

    
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
common_females_frequency.hist(ax=axes[0])
axes[0].set_title("Post Frequency by Females - Histogram")
common_males_frequency.hist(ax=axes[1])
axes[1].set_title("Post Frequency by Males - Histogram")









    Out[59]:





<matplotlib.text.Text at 0x111c05a90>

Hypotesis test



In [60]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( common_females_frequency, common_males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_frequency, common_males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  0.453230652257

Inicialization

Here you can find the data importing and some useful functions used for analysing the data. Please, run this first, otherwise the analysis will not work.

Importing the data from the MongoDB database and inserting into a panda dataframe for easy manipulation.



In [15]:

    
from __future__ import division
import pymongo, time, pylab, numpy, pandas, math
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot
import statsmodels.api as sm

%matplotlib inline
mpl.style.use('ggplot')
# pyplot.rcdefaults()

client = pymongo.MongoClient('localhost', 27017)

community = 'programmers'
stats_db = client[community].statistics

cursor = stats_db.find({'$or': [{'questions_total':{'$gt':0}}, {'answers_total':{'$gt':0}}, 
                                {'comments_total':{'$gt':0}}], 
                        'gender': {'$ne': "Unknown"} }, 
                       {u'_id': False, u'dates': True, u'reputation': True,
                        u'joined': True, u'gender':True,
                        'lifetime': True, 'max_interval': True, 'days_active': True, 
                        'gender_cat': True, 'activity_freq': True})

df =  pandas.DataFrame(list(cursor))



In [16]:

    
import datetime

def seen_death(row):
    recent = datetime.datetime(2014,1,20) - datetime.timedelta(days=int(df["max_interval"].mean()))
    return row["dates"][-1] < recent

df["dead"] = df.apply(seen_death, axis=1)



In [ ]:

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	383.000000	383.000000	383	383.000000	383.000000	383.000000	383
mean	1.629292	10.091384	0	198.649195	66.091384	472.519582	0.8929504
std	1.055821	34.690245	0	316.532569	133.999979	1833.622104	0.3095807
min	1.000000	1.000000	0	0.000000	0.000000	51.000000	False
25%	1.000000	1.000000	0	0.000000	0.000000	101.000000	1
50%	1.200000	2.000000	0	4.721997	0.000000	127.000000	1
75%	2.000000	5.000000	0	302.721478	71.500000	213.500000	1
max	9.000000	370.000000	0	1218.029161	776.000000	24391.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	8734.000000	8734.000000	8734	8734.000000	8734.000000	8734.000000	8734
mean	1.637862	8.804900	1	209.364255	75.435081	421.200939	0.8970689
std	1.004615	31.260074	0	316.084008	149.071970	1759.699954	0.3038862
min	1.000000	1.000000	1	0.000000	0.000000	50.000000	False
25%	1.000000	1.000000	1	0.000000	0.000000	103.000000	1
50%	1.285714	2.000000	1	12.440703	0.000000	133.000000	1
75%	2.000000	5.000000	1	328.129638	88.000000	239.000000	1
max	18.000000	755.000000	1	1233.767837	1161.000000	62314.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	9117.000000	9117.000000	9117.000000	9117.000000	9117.000000	9117.000000	9117
mean	1.637502	8.858945	0.957991	208.914122	75.042558	423.356806	0.8968959
std	1.006759	31.410687	0.200622	316.092791	148.474730	1762.793560	0.3041115
min	1.000000	1.000000	0.000000	0.000000	0.000000	50.000000	False
25%	1.000000	1.000000	1.000000	0.000000	0.000000	103.000000	1
50%	1.280000	2.000000	1.000000	11.966711	0.000000	133.000000	1
75%	2.000000	5.000000	1.000000	326.909254	88.000000	239.000000	1
max	18.000000	755.000000	1.000000	1233.767837	1161.000000	62314.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	46.000000	46.000000	46	46.000000	46.000000	46.000000	46
mean	1.933474	63.239130	0	679.853150	151.043478	2842.021739	0.7391304
std	0.624939	82.829191	0	383.046066	120.207036	4687.361966	0.4439611
min	1.000000	1.000000	0	0.313784	0.000000	454.000000	False
25%	1.438034	16.250000	0	331.432239	48.500000	685.000000	0.25
50%	1.890000	32.000000	0	721.040561	136.000000	1223.000000	1
75%	2.360197	70.500000	0	995.340364	204.250000	2505.250000	1
max	3.437500	370.000000	0	1218.029161	500.000000	24391.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	1113.000000	1113.000000	1113	1113.000000	1113.000000	1113.000000	1113
mean	1.932771	47.292004	1	596.251071	158.495957	2234.580413	0.7610063
std	0.868659	76.434107	0	376.621199	151.640158	4528.080814	0.4266606
min	1.000000	1.000000	1	0.000000	0.000000	451.000000	False
25%	1.400000	9.000000	1	263.820522	47.000000	606.000000	1
50%	1.692308	21.000000	1	620.107217	119.000000	910.000000	1
75%	2.190476	47.000000	1	920.162689	215.000000	1815.000000	1
max	7.800000	755.000000	1	1233.767837	940.000000	62314.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	1159.000000	1159.000000	1159.000000	1159.000000	1159.000000	1159.000000	1159
mean	1.932799	47.924935	0.960311	599.569186	158.200173	2258.689387	0.7601381
std	0.860099	76.722930	0.195313	377.064087	150.482355	4533.974662	0.4271834
min	1.000000	1.000000	0.000000	0.000000	0.000000	451.000000	False
25%	1.400000	9.000000	1.000000	268.974931	47.000000	606.500000	1
50%	1.695652	22.000000	1.000000	625.687289	120.000000	912.000000	1
75%	2.200000	48.000000	1.000000	926.605435	214.500000	1840.500000	1
max	7.800000	755.000000	1.000000	1233.767837	940.000000	62314.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	337.000000	337.000000	337	337.000000	337.000000	337.000000	337
mean	1.587771	2.836795	0	132.965569	54.495549	149.086053	0.9139466
std	1.095752	3.134877	0	241.310392	131.742695	79.347610	0.2808599
min	1.000000	1.000000	0	0.000000	0.000000	51.000000	False
25%	1.000000	1.000000	0	0.000000	0.000000	101.000000	1
50%	1.000000	2.000000	0	0.761798	0.000000	121.000000	1
75%	2.000000	3.000000	0	160.812156	16.000000	159.000000	1
max	9.000000	21.000000	0	1113.368895	776.000000	450.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	7958.000000	7958.000000	7958.000000	7958.000000	7958.000000	7958.000000	7958
mean	1.594495	3.169389	0.957653	152.019271	62.931515	156.059688	0.9168133
std	1.019307	4.359889	0.201393	261.362091	144.243616	79.193383	0.2761816
min	1.000000	1.000000	0.000000	0.000000	0.000000	50.000000	False
25%	1.000000	1.000000	1.000000	0.000000	0.000000	101.000000	1
50%	1.105556	2.000000	1.000000	1.470971	0.000000	123.000000	1
75%	2.000000	3.000000	1.000000	203.931563	39.000000	177.750000	1
max	18.000000	87.000000	1.000000	1221.605458	1161.000000	450.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.380506	0.006349	0.430490	0.374827	0.350092	-0.102759
days_active	0.380506	1.000000	0.009307	0.886377	0.828051	0.656185	-0.211735
gender_cat	0.006349	0.009307	1.000000	0.010895	0.017022	0.019255	0.003507
lifetime	0.430490	0.886377	0.010895	1.000000	0.808751	0.579643	-0.239589
max_interval	0.374827	0.828051	0.017022	0.808751	1.000000	0.542324	-0.200893
reputation	0.350092	0.656185	0.019255	0.579643	0.542324	1.000000	-0.105239
dead	-0.102759	-0.211735	0.003507	-0.239589	-0.200893	-0.105239	1.000000

Dep. Variable:	days_active	No. Observations:	9117
Model:	GLM	Df Residuals:	9115
Model Family:	NegativeBinomial	Df Model:	1
Link Function:	log	Scale:	11.295333667
Method:	IRLS	Log-Likelihood:	-29498.
Date:	Sun, 05 Oct 2014	Deviance:	17829.
Time:	22:17:03	Pearson chi2:	1.03e+05
No. Iterations:	8

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	2.3117	0.180	12.840	0.000	1.959 2.665
gender_cat	-0.1364	0.184	-0.741	0.459	-0.497 0.224

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.315158	-0.028147	0.100859	-0.001372	0.339114	-0.110973
days_active	0.315158	1.000000	-0.071390	0.644933	0.191464	0.733031	-0.463801
gender_cat	-0.028147	-0.071390	1.000000	-0.044740	-0.009697	-0.044905	0.010207
lifetime	0.100859	0.644933	-0.044740	1.000000	0.569067	0.452598	-0.517108
max_interval	-0.001372	0.191464	-0.009697	0.569067	1.000000	0.068937	-0.110230
reputation	0.339114	0.733031	-0.044905	0.452598	0.068937	1.000000	-0.305872
dead	-0.110973	-0.463801	0.010207	-0.517108	-0.110230	-0.305872	1.000000

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.320997	0.008963	0.415882	0.344223	0.280651	-0.062252
days_active	0.320997	1.000000	0.012734	0.877658	0.814611	0.515511	-0.136177
gender_cat	0.008963	0.012734	1.000000	0.014673	0.019978	0.022805	0.003135
lifetime	0.415882	0.877658	0.014673	1.000000	0.767169	0.470158	-0.159096
max_interval	0.344223	0.814611	0.019978	0.767169	1.000000	0.438500	-0.155000
reputation	0.280651	0.515511	0.022805	0.470158	0.438500	1.000000	-0.004011
dead	-0.062252	-0.136177	0.003135	-0.159096	-0.155000	-0.004011	1.000000