Community: SuperUser

Initialization and data importing are at the end of this notebook. For better visualization of the analysis they were placed at the bottom, but it's necessary to run them first so that the analysis works as expected. Click here to go there.

Data Segregation

All contributors



In [3]:

    
# Split the contributors by declared gender; all_users is an independent copy of the full frame.
is_male = df['gender']=='Male'
is_female = df['gender']=='Female'
males = df[is_male]
females = df[is_female]
all_users = df.copy()



In [4]:

    
print females.median()
females.describe()









    



activity_freq      1.333333
days_active        2.000000
gender_cat         0.000000
lifetime          46.670011
max_interval       0.000000
reputation       116.000000
dead               1.000000
dtype: float64






    Out[4]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       788.000000
       788.000000
       788
        788.000000
        788.000000
         788.000000
             788
    
    
      mean
         1.683857
         8.010152
         0
        283.543000
        117.312183
         238.482234
       0.8020305
    
    
      std
         1.049937
        29.875726
         0
        402.226189
        214.023901
         727.567127
       0.3987222
    
    
      min
         1.000000
         1.000000
         0
          0.000000
          0.000000
          51.000000
           False
    
    
      25%
         1.000000
         1.000000
         0
          0.000000
          0.000000
         101.000000
               1
    
    
      50%
         1.333333
         2.000000
         0
         46.670011
          0.000000
         116.000000
               1
    
    
      75%
         2.000000
         6.000000
         0
        479.477738
        165.000000
         156.000000
               1
    
    
      max
        10.921053
       638.000000
         0
       1615.406364
       1397.000000
       15164.000000
            True



In [5]:

    
print males.median()
males.describe()









    



activity_freq      1.333333
days_active        3.000000
gender_cat         1.000000
lifetime          75.875431
max_interval       1.000000
reputation       118.000000
dead               1.000000
dtype: float64






    Out[5]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       18264.000000
       18264.000000
       18264
       18264.000000
       18264.000000
       18264.000000
           18264
    
    
      mean
           1.611495
           8.218079
           1
         320.003430
         122.071671
         267.312308
       0.8019601
    
    
      std
           0.964111
          28.102552
           0
         431.008066
         205.042662
        1473.342909
       0.3985333
    
    
      min
           1.000000
           1.000000
           1
           0.000000
           0.000000
          50.000000
           False
    
    
      25%
           1.000000
           1.000000
           1
           0.000808
           0.000000
         101.000000
               1
    
    
      50%
           1.333333
           3.000000
           1
          75.875431
           1.000000
         118.000000
               1
    
    
      75%
           2.000000
           6.000000
           1
         547.338934
         183.000000
         170.000000
               1
    
    
      max
          29.000000
         995.000000
           1
        1649.334414
        1532.000000
       99647.000000
            True



In [6]:

    
print all_users.median()
all_users.describe()









    



activity_freq      1.333333
days_active        3.000000
gender_cat         1.000000
lifetime          73.488142
max_interval       1.000000
reputation       118.000000
dead               1.000000
dtype: float64






    Out[6]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       19052.000000
       19052.000000
       19052.000000
       19052.000000
       19052.000000
       19052.000000
           19052
    
    
      mean
           1.614488
           8.209479
           0.958640
         318.495409
         121.874816
         266.119882
        0.801963
    
    
      std
           0.967889
          28.177306
           0.199128
         429.907212
         205.418269
        1450.121524
       0.3985306
    
    
      min
           1.000000
           1.000000
           0.000000
           0.000000
           0.000000
          50.000000
           False
    
    
      25%
           1.000000
           1.000000
           1.000000
           0.000772
           0.000000
         101.000000
               1
    
    
      50%
           1.333333
           3.000000
           1.000000
          73.488142
           1.000000
         118.000000
               1
    
    
      75%
           2.000000
           6.000000
           1.000000
         545.616584
         182.000000
         170.000000
               1
    
    
      max
          29.000000
         995.000000
           1.000000
        1649.334414
        1532.000000
       99647.000000
            True

Top contributors



In [7]:

    
# "Top" contributors are those whose reputation is strictly above this value.
top_reputation_floor = 400

top_females = females[females["reputation"] > top_reputation_floor]
top_males = males[males["reputation"] > top_reputation_floor]
top_users = all_users[all_users["reputation"] > top_reputation_floor]



In [8]:

    
print top_females.median()
top_females.describe()









    



activity_freq      2.173913
days_active       28.000000
gender_cat         0.000000
lifetime         864.106413
max_interval     174.000000
reputation       726.000000
dead               1.000000
dtype: float64






    Out[8]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       61.000000
        61.000000
       61
         61.000000
        61.000000
          61.000000
              61
    
    
      mean
        2.494243
        54.295082
        0
        791.178752
       218.918033
        1487.803279
       0.6229508
    
    
      std
        1.743588
        94.715072
        0
        511.045778
       204.107185
        2274.934327
       0.4886694
    
    
      min
        1.000000
         1.000000
        0
          0.000000
         0.000000
         411.000000
           False
    
    
      25%
        1.535714
        14.000000
        0
        372.744458
        58.000000
         517.000000
               0
    
    
      50%
        2.173913
        28.000000
        0
        864.106413
       174.000000
         726.000000
               1
    
    
      75%
        2.740741
        56.000000
        0
       1208.436864
       353.000000
        1285.000000
               1
    
    
      max
       10.921053
       638.000000
        0
       1615.406364
       905.000000
       15164.000000
            True



In [9]:

    
print top_males.median()
top_males.describe()









    



activity_freq      1.852564
days_active       28.000000
gender_cat         1.000000
lifetime         890.464130
max_interval     178.000000
reputation       713.000000
dead               1.000000
dtype: float64






    Out[9]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       1525.000000
       1525.000000
       1525
       1525.000000
       1525.000000
        1525.000000
            1525
    
    
      mean
          2.224935
         52.252459
          1
        860.348298
        221.782295
        1690.312787
       0.6242623
    
    
      std
          1.371311
         83.849031
          0
        499.472963
        193.750736
        4874.398334
       0.4844717
    
    
      min
          1.000000
          1.000000
          1
          0.000000
          0.000000
         401.000000
           False
    
    
      25%
          1.482759
         14.000000
          1
        443.051386
         81.000000
         511.000000
               0
    
    
      50%
          1.852564
         28.000000
          1
        890.464130
        178.000000
         713.000000
               1
    
    
      75%
          2.465116
         55.000000
          1
       1308.857871
        302.000000
        1346.000000
               1
    
    
      max
         18.428571
        995.000000
          1
       1649.334414
       1126.000000
       99647.000000
            True



In [10]:

    
print top_users.median()
top_users.describe()









    



activity_freq      1.865766
days_active       28.000000
gender_cat         1.000000
lifetime         890.143155
max_interval     177.500000
reputation       716.500000
dead               1.000000
dtype: float64






    Out[10]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       1586.000000
       1586.000000
       1586.000000
       1586.000000
       1586.000000
        1586.00000
            1586
    
    
      mean
          2.235293
         52.331021
          0.961538
        857.687931
        221.672131
        1682.52396
       0.6242119
    
    
      std
          1.387764
         84.260476
          0.192368
        499.935586
        194.092618
        4800.28902
       0.4844785
    
    
      min
          1.000000
          1.000000
          0.000000
          0.000000
          0.000000
         401.00000
           False
    
    
      25%
          1.485065
         14.000000
          1.000000
        437.706032
         80.000000
         511.50000
               0
    
    
      50%
          1.865766
         28.000000
          1.000000
        890.143155
        177.500000
         716.50000
               1
    
    
      75%
          2.488644
         55.000000
          1.000000
       1306.842465
        302.750000
        1338.50000
               1
    
    
      max
         18.428571
        995.000000
          1.000000
       1649.334414
       1126.000000
       99647.00000
            True

Common contributors



In [11]:

    
# "Common" contributors are those whose reputation is at most this value
# (the complement of the "reputation > 400" top-contributor split).
common_reputation_cap = 400

common_females = females[females["reputation"] <= common_reputation_cap]
common_males = males[males["reputation"] <= common_reputation_cap]
common_users = all_users[all_users["reputation"] <= common_reputation_cap]



In [12]:

    
print common_females.median()
common_females.describe()









    



activity_freq      1.285714
days_active        2.000000
gender_cat         0.000000
lifetime          15.068972
max_interval       0.000000
reputation       111.000000
dead               1.000000
dtype: float64






    Out[12]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       727.000000
       727.000000
       727
        727.000000
        727.000000
       727.000000
             727
    
    
      mean
         1.615860
         4.126547
         0
        240.949079
        108.786795
       133.656121
       0.8170564
    
    
      std
         0.940177
         5.572260
         0
        361.007830
        212.771719
        63.030995
       0.3868865
    
    
      min
         1.000000
         1.000000
         0
          0.000000
          0.000000
        51.000000
           False
    
    
      25%
         1.000000
         1.000000
         0
          0.000000
          0.000000
       101.000000
               1
    
    
      50%
         1.285714
         2.000000
         0
         15.068972
          0.000000
       111.000000
               1
    
    
      75%
         2.000000
         5.000000
         0
        395.414585
        132.500000
       141.000000
               1
    
    
      max
         9.000000
        57.000000
         0
       1509.208838
       1397.000000
       400.000000
            True



In [13]:

    
print common_males.median()
common_males.describe()









    



activity_freq      1.25000
days_active        2.00000
gender_cat         1.00000
lifetime          37.37005
max_interval       0.00000
reputation       115.00000
dead               1.00000
dtype: float64






    Out[13]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       16739.000000
       16739.000000
       16739
       16739.000000
       16739.000000
       16739.000000
           16739
    
    
      mean
           1.555607
           4.206344
           1
         270.775524
         112.987574
         137.670410
       0.8181492
    
    
      std
           0.897534
           5.366969
           0
         388.527346
         203.633635
          62.310956
        0.385733
    
    
      min
           1.000000
           1.000000
           1
           0.000000
           0.000000
          50.000000
           False
    
    
      25%
           1.000000
           1.000000
           1
           0.000000
           0.000000
         101.000000
               1
    
    
      50%
           1.250000
           2.000000
           1
          37.370050
           0.000000
         115.000000
               1
    
    
      75%
           1.857143
           5.000000
           1
         444.341879
         156.000000
         150.000000
               1
    
    
      max
          29.000000
          80.000000
           1
        1645.057552
        1532.000000
         400.000000
            True



In [14]:

    
print common_users.median()
common_users.describe()









    



activity_freq      1.250000
days_active        2.000000
gender_cat         1.000000
lifetime          36.386098
max_interval       0.000000
reputation       115.000000
dead               1.000000
dtype: float64






    Out[14]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       17466.000000
       17466.000000
       17466.000000
       17466.000000
       17466.000000
       17466.000000
           17466
    
    
      mean
           1.558115
           4.203023
           0.958376
         269.534036
         112.812722
         137.503321
       0.8181037
    
    
      std
           0.899402
           5.375529
           0.199734
         387.456979
         204.017554
          62.344425
         0.38577
    
    
      min
           1.000000
           1.000000
           0.000000
           0.000000
           0.000000
          50.000000
           False
    
    
      25%
           1.000000
           1.000000
           1.000000
           0.000000
           0.000000
         101.000000
               1
    
    
      50%
           1.250000
           2.000000
           1.000000
          36.386098
           0.000000
         115.000000
               1
    
    
      75%
           1.875000
           5.000000
           1.000000
         441.509633
         155.000000
         149.000000
               1
    
    
      max
          29.000000
          80.000000
           1.000000
        1645.057552
        1532.000000
         400.000000
            True

Third Question: Do women and men contribute for the same amount of time?

Hypothesis 1: The survival time of users is the same among genders.

H0: survivalTime(Males) = survivalTime(Females);

H1: survivalTime(Males) != survivalTime(Females).

Data



In [15]:

    
from lifelines import KaplanMeierFitter

# Survival inputs for every contributor:
#   T - the "lifetime" column, measured in days
#   C - the "dead" flag, passed to the fitter as event_observed
T, C = all_users["lifetime"], all_users["dead"]

# Boolean row masks used to split T and C by gender
males_ = all_users["gender"] == "Male"
females_ = all_users["gender"] == "Female"

kmf = KaplanMeierFitter()



In [16]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  91.5551171644
Median:  143.891217859






    Out[16]:





<matplotlib.text.Text at 0x11362eed0>

Hypothesis test



In [17]:

    
from lifelines.statistics import logrank_test

# Log-rank comparison of the female and male lifetime samples, with their event flags.
female_T, male_T = T[females_], T[males_]
female_C, male_C = C[females_], C[males_]
summary, p_value, test_results = logrank_test(female_T, male_T, female_C, male_C, alpha=.95)









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.06454 |              3.417 |     None

Looking at the top contributors



In [18]:

    
from lifelines import KaplanMeierFitter

# Survival inputs restricted to the top contributors:
#   T - the "lifetime" column, measured in days
#   C - the "dead" flag, passed to the fitter as event_observed
T, C = top_users["lifetime"], top_users["dead"]

# Boolean row masks used to split T and C by gender
males_ = top_users["gender"] == "Male"
females_ = top_users["gender"] == "Female"

kmf = KaplanMeierFitter()



In [19]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  980.231270868
Median:  1113.77744398






    Out[19]:





<matplotlib.text.Text at 0x1132f66d0>

Hypothesis test



In [20]:

    
from lifelines.statistics import logrank_test

# Log-rank comparison of the female and male lifetime samples, with their event flags.
female_T, male_T = T[females_], T[males_]
female_C, male_C = C[females_], C[males_]
summary, p_value, test_results = logrank_test(female_T, male_T, female_C, male_C, alpha=.95)









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.52670 |              0.401 |     None

Looking at the common contributors



In [21]:

    
from lifelines import KaplanMeierFitter

# Survival inputs restricted to the common contributors:
#   T - the "lifetime" column, measured in days
#   C - the "dead" flag, passed to the fitter as event_observed
T, C = common_users["lifetime"], common_users["dead"]

# Boolean row masks used to split T and C by gender
males_ = common_users["gender"] == "Male"
females_ = common_users["gender"] == "Female"

kmf = KaplanMeierFitter()



In [22]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  52.1448481134
Median:  84.9269020833






    Out[22]:





<matplotlib.text.Text at 0x1132a91d0>

Hypothesis test



In [23]:

    
from lifelines.statistics import logrank_test

# Log-rank comparison of the female and male lifetime samples, with their event flags.
female_T, male_T = T[females_], T[males_]
female_C, male_C = C[females_], C[males_]
summary, p_value, test_results = logrank_test(female_T, male_T, female_C, male_C, alpha=.95)









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.09921 |              2.718 |     None

Hypothesis 2: The amount of activity days is the same between genders.

H0: daysActive(Males) = daysActive(Females);

H1: daysActive(Males) != daysActive(Females).

Correlation and Negative Binomial Regression



In [24]:

    
# Pairwise Spearman rank correlations between the columns of all_users.
spearman_all_users = all_users.corr(method="spearman")
spearman_all_users









    Out[24]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.425268
      -0.009161
       0.407633
       0.393553
       0.329942
      -0.130060
    
    
      days_active
       0.425268
       1.000000
       0.009320
       0.864971
       0.817889
       0.616885
      -0.252284
    
    
      gender_cat
      -0.009161
       0.009320
       1.000000
       0.014137
       0.010376
       0.017309
       0.002831
    
    
      lifetime
       0.407633
       0.864971
       0.014137
       1.000000
       0.828082
       0.543538
      -0.258530
    
    
      max_interval
       0.393553
       0.817889
       0.010376
       0.828082
       1.000000
       0.493821
      -0.225892
    
    
      reputation
       0.329942
       0.616885
       0.017309
       0.543538
       0.493821
       1.000000
      -0.087809
    
    
      dead
      -0.130060
      -0.252284
       0.002831
      -0.258530
      -0.225892
      -0.087809
       1.000000



In [25]:

    
# Response: number of active days. Predictor: an intercept plus the gender_cat column
# (0 in the females frame and 1 in the males frame, per the describe() tables above).
endog = all_users["days_active"]
exog = sm.add_constant(all_users["gender_cat"], prepend=True)

# GLM with a negative binomial family, fitted with statsmodels' default settings.
mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())
res_nbin = mod_nbin.fit()
res_nbin.summary()









    Out[25]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:       19052    


  Model:                  GLM          Df Residuals:           19050    


  Model Family:    NegativeBinomial    Df Model:                   1    


  Link Function:          log          Scale:              10.5053357719


  Method:                IRLS          Log-Likelihood:        -60278.   


  Date:            Sun, 05 Oct 2014    Deviance:               30689.   


  Time:                23:18:31        Pearson chi2:         2.00e+05   


  No. Iterations:          7                                            




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           2.0807      0.122     16.991   0.000      1.841     2.321


  gender_cat      0.0256      0.125      0.205   0.838     -0.219     0.271

Data's shape



In [26]:

    
# Scatter of the fitted means against the observed response for the model fitted above.
nobs = res_nbin.nobs

# endog is a 1-D Series, so it has no axis 1 to sum over (current pandas raises on
# `endog.sum(1)`); normalise by the grand total instead.
# NOTE(review): y is each user's share of all active days while yhat is res_nbin.mu, so the
# two axes look to be on different scales - confirm this is the intended diagnostic.
y = endog / endog.sum()

yhat = res_nbin.mu

pyplot.scatter(yhat, y)

# Displayed as the cell's output: the fitted coefficients.
res_nbin.params









    Out[26]:





const         2.080710
gender_cat    0.025627
dtype: float64

Looking at the top contributors



In [27]:

    
# Pairwise Spearman rank correlations between the columns of top_users.
spearman_top_users = top_users.corr(method="spearman")
spearman_top_users









    Out[27]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.245548
      -0.042593
      -0.115657
      -0.084766
       0.323587
      -0.150433
    
    
      days_active
       0.245548
       1.000000
       0.006048
       0.485886
       0.024067
       0.625802
      -0.443778
    
    
      gender_cat
      -0.042593
       0.006048
       1.000000
       0.026432
       0.011147
      -0.007394
       0.003344
    
    
      lifetime
      -0.115657
       0.485886
       0.026432
       1.000000
       0.607747
       0.255104
      -0.382232
    
    
      max_interval
      -0.084766
       0.024067
       0.011147
       0.607747
       1.000000
      -0.040356
      -0.055861
    
    
      reputation
       0.323587
       0.625802
      -0.007394
       0.255104
      -0.040356
       1.000000
      -0.224840
    
    
      dead
      -0.150433
      -0.443778
       0.003344
      -0.382232
      -0.055861
      -0.224840
       1.000000



In [28]:

    
# Response: number of active days. Predictor: an intercept plus the gender_cat column
# (0 in the females frame and 1 in the males frame, per the describe() tables above).
endog = top_users["days_active"]
exog = sm.add_constant(top_users["gender_cat"], prepend=True)

# GLM with a negative binomial family, fitted with statsmodels' default settings.
mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())
res_nbin = mod_nbin.fit()
res_nbin.summary()









    Out[28]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:        1586    


  Model:                  GLM          Df Residuals:            1584    


  Model Family:    NegativeBinomial    Df Model:                   1    


  Link Function:          log          Scale:              2.54415393463


  Method:                IRLS          Log-Likelihood:        -7877.8   


  Date:            Sun, 05 Oct 2014    Deviance:               1982.7   


  Time:                23:18:57        Pearson chi2:         4.03e+03   


  No. Iterations:          7                                            




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           3.9944      0.206     19.381   0.000      3.590     4.398


  gender_cat     -0.0383      0.210     -0.182   0.855     -0.450     0.374

Data's shape



In [29]:

    
# Scatter of the fitted means against the observed response for the model fitted above.
nobs = res_nbin.nobs

# endog is a 1-D Series, so it has no axis 1 to sum over (current pandas raises on
# `endog.sum(1)`); normalise by the grand total instead.
# NOTE(review): y is each user's share of all active days while yhat is res_nbin.mu, so the
# two axes look to be on different scales - confirm this is the intended diagnostic.
y = endog / endog.sum()

yhat = res_nbin.mu

pyplot.scatter(yhat, y)

# Displayed as the cell's output: the fitted coefficients.
res_nbin.params









    Out[29]:





const         3.994434
gender_cat   -0.038347
dtype: float64

Looking at the common contributors



In [30]:

    
# Pairwise Spearman rank correlations between the columns of common_users.
spearman_common_users = common_users.corr(method="spearman")
spearman_common_users









    Out[30]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.381664
      -0.008192
       0.393000
       0.380734
       0.262264
      -0.100779
    
    
      days_active
       0.381664
       1.000000
       0.008329
       0.862652
       0.827520
       0.523514
      -0.214981
    
    
      gender_cat
      -0.008192
       0.008329
       1.000000
       0.012498
       0.009938
       0.018110
       0.003460
    
    
      lifetime
       0.393000
       0.862652
       0.012498
       1.000000
       0.814070
       0.475928
      -0.222272
    
    
      max_interval
       0.380734
       0.827520
       0.009938
       0.814070
       1.000000
       0.442743
      -0.208311
    
    
      reputation
       0.262264
       0.523514
       0.018110
       0.475928
       0.442743
       1.000000
      -0.027771
    
    
      dead
      -0.100779
      -0.214981
       0.003460
      -0.222272
      -0.208311
      -0.027771
       1.000000



In [31]:

    
# Response: number of active days. Predictor: an intercept plus the gender_cat column
# (0 in the females frame and 1 in the males frame, per the describe() tables above).
endog = common_users["days_active"]
exog = sm.add_constant(common_users["gender_cat"], prepend=True)

# GLM with a negative binomial family, fitted with statsmodels' default settings.
mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())
res_nbin = mod_nbin.fit()
res_nbin.summary()









    Out[31]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:       17466   


  Model:                  GLM          Df Residuals:           17464   


  Model Family:    NegativeBinomial    Df Model:                   1   


  Link Function:          log          Scale:              1.3216245764


  Method:                IRLS          Log-Likelihood:        -44474.  


  Date:            Sun, 05 Oct 2014    Deviance:               12854.  


  Time:                23:20:02        Pearson chi2:         2.31e+04  


  No. Iterations:          6                                           




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           1.4174      0.048     29.826   0.000      1.324     1.511


  gender_cat      0.0192      0.049      0.395   0.693     -0.076     0.114

Data's shape



In [32]:

    
# Scatter of the fitted means against the observed response for the model fitted above.
nobs = res_nbin.nobs

# endog is a 1-D Series, so it has no axis 1 to sum over (current pandas raises on
# `endog.sum(1)`); normalise by the grand total instead.
# NOTE(review): y is each user's share of all active days while yhat is res_nbin.mu, so the
# two axes look to be on different scales - confirm this is the intended diagnostic.
y = endog / endog.sum()

yhat = res_nbin.mu

pyplot.scatter(yhat, y)

# Displayed as the cell's output: the fitted coefficients.
res_nbin.params









    Out[32]:





const         1.417441
gender_cat    0.019153
dtype: float64

Hypothesis 3: The frequency of participation is the same between genders.

H0: frequency(Males) = frequency(Females);

H1: frequency(Males) != frequency(Females).

Data



In [33]:

    
# The activity_freq column for each gender, taken from all contributors.
females_frequency, males_frequency = females['activity_freq'], males['activity_freq']

Data's summary



In [34]:

    
print "Female:"
print females_frequency.describe()
print "Median: ", females_frequency.median()

print 
print "Male:"
print males_frequency.describe()
print "Median: ", males_frequency.median()









    



Female:
count    788.000000
mean       1.683857
std        1.049937
min        1.000000
25%        1.000000
50%        1.333333
75%        2.000000
max       10.921053
dtype: float64
Median:  1.33333333333

Male:
count    18264.000000
mean         1.611495
std          0.964111
min          1.000000
25%          1.000000
50%          1.333333
75%          2.000000
max         29.000000
dtype: float64
Median:  1.33333333333

The data's shape



In [35]:

    
# Two side-by-side panels: females on the left, males on the right.
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
female_ax, male_ax = axes

females_frequency.hist(ax=female_ax)
female_ax.set_title("Post Frequency by Females - Histogram")

males_frequency.hist(ax=male_ax)
male_ax.set_title("Post Frequency by Males - Histogram")









    Out[35]:





<matplotlib.text.Text at 0x10e857210>

Hypothesis test



In [36]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( females_frequency, males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_frequency, males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  0.222733223006

Looking at the top contributors



In [37]:

    
# The activity_freq column for each gender, taken from the top contributors.
top_females_frequency, top_males_frequency = top_females['activity_freq'], top_males['activity_freq']

Data's summary



In [38]:

    
print "Female:"
print top_females_frequency.describe()
print "Median: ", top_females_frequency.median()

print 
print "Male:"
print top_males_frequency.describe()
print "Median: ", top_males_frequency.median()









    



Female:
count    61.000000
mean      2.494243
std       1.743588
min       1.000000
25%       1.535714
50%       2.173913
75%       2.740741
max      10.921053
dtype: float64
Median:  2.17391304348

Male:
count    1525.000000
mean        2.224935
std         1.371311
min         1.000000
25%         1.482759
50%         1.852564
75%         2.465116
max        18.428571
dtype: float64
Median:  1.85256410256

The data's shape



In [39]:

    
# Two side-by-side panels: females on the left, males on the right.
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
female_ax, male_ax = axes

top_females_frequency.hist(ax=female_ax)
female_ax.set_title("Post Frequency by Females - Histogram")

top_males_frequency.hist(ax=male_ax)
male_ax.set_title("Post Frequency by Males - Histogram")









    Out[39]:





<matplotlib.text.Text at 0x10e5edd90>

Hypothesis test



In [40]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( top_females_frequency, top_males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_frequency, top_males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  0.0899671760473

Looking at the common contributors



In [41]:

    
# The activity_freq column for each gender, taken from the common contributors.
common_females_frequency, common_males_frequency = common_females['activity_freq'], common_males['activity_freq']

Data's summary



In [42]:

    
print "Female:"
print common_females_frequency.describe()
print "Median: ", common_females_frequency.median()

print 
print "Male:"
print common_males_frequency.describe()
print "Median: ", common_males_frequency.median()









    



Female:
count    727.000000
mean       1.615860
std        0.940177
min        1.000000
25%        1.000000
50%        1.285714
75%        2.000000
max        9.000000
dtype: float64
Median:  1.28571428571

Male:
count    16739.000000
mean         1.555607
std          0.897534
min          1.000000
25%          1.000000
50%          1.250000
75%          1.857143
max         29.000000
dtype: float64
Median:  1.25

The data's shape



In [43]:

    
# Two side-by-side panels: females on the left, males on the right.
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
female_ax, male_ax = axes

common_females_frequency.hist(ax=female_ax)
female_ax.set_title("Post Frequency by Females - Histogram")

common_males_frequency.hist(ax=male_ax)
male_ax.set_title("Post Frequency by Males - Histogram")









    Out[43]:





<matplotlib.text.Text at 0x10e5b9ed0>

Hypothesis test



In [44]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( common_females_frequency, common_males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_frequency, common_males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  0.301601233707

Initialization

Here you can find the data importing and some useful functions used for analysing the data. Please, run this first, otherwise the analysis will not work.

Importing the data from the MongoDB database and loading it into a pandas DataFrame for easy manipulation.



In [1]:

    
from __future__ import division
import pymongo, time, pylab, numpy, pandas, math
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot
import statsmodels.api as sm

%matplotlib inline
mpl.style.use('ggplot')
# pyplot.rcdefaults()

client = pymongo.MongoClient('localhost', 27017)

community = 'superuser'
stats_db = client[community].statistics

cursor = stats_db.find({'$or': [{'questions_total':{'$gt':0}}, {'answers_total':{'$gt':0}}, 
                                {'comments_total':{'$gt':0}}], 
                        'gender': {'$ne': "Unknown"} }, 
                       {u'_id': False, u'dates': True, u'reputation': True,
                        u'joined': True, u'gender':True,
                        'lifetime': True, 'max_interval': True, 'days_active': True, 
                        'gender_cat': True, 'activity_freq': True})

df =  pandas.DataFrame(list(cursor))



In [2]:

    
import datetime

# Reference point for the "dead" flag.
# NOTE(review): presumably the date the data was collected — confirm.
REFERENCE_DATE = datetime.datetime(2014, 1, 20)

# Users whose last activity is older than this are flagged as dead: the
# reference date minus the mean `max_interval` (truncated to whole days).
# Computed once here; it was previously re-derived from the full column for
# every row during `apply`.
DEATH_CUTOFF = REFERENCE_DATE - datetime.timedelta(days=int(df["max_interval"].mean()))


def seen_death(row, cutoff=None):
    """Tell whether a user's last recorded date falls before the cutoff.

    Parameters
    ----------
    row : mapping with a "dates" entry
        A DataFrame row; only the last element of ``row["dates"]`` is read.
        NOTE(review): assumes "dates" is in chronological order and holds
        datetime values comparable with the cutoff — confirm.
    cutoff : datetime.datetime, optional
        Threshold to compare against; defaults to ``DEATH_CUTOFF``.

    Returns
    -------
    The result of ``row["dates"][-1] < cutoff``.
    """
    if cutoff is None:
        cutoff = DEATH_CUTOFF
    return row["dates"][-1] < cutoff


df["dead"] = df.apply(seen_death, axis=1)



In [ ]:

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	788.000000	788.000000	788	788.000000	788.000000	788.000000	788
mean	1.683857	8.010152	0	283.543000	117.312183	238.482234	0.8020305
std	1.049937	29.875726	0	402.226189	214.023901	727.567127	0.3987222
min	1.000000	1.000000	0	0.000000	0.000000	51.000000	False
25%	1.000000	1.000000	0	0.000000	0.000000	101.000000	1
50%	1.333333	2.000000	0	46.670011	0.000000	116.000000	1
75%	2.000000	6.000000	0	479.477738	165.000000	156.000000	1
max	10.921053	638.000000	0	1615.406364	1397.000000	15164.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	18264.000000	18264.000000	18264	18264.000000	18264.000000	18264.000000	18264
mean	1.611495	8.218079	1	320.003430	122.071671	267.312308	0.8019601
std	0.964111	28.102552	0	431.008066	205.042662	1473.342909	0.3985333
min	1.000000	1.000000	1	0.000000	0.000000	50.000000	False
25%	1.000000	1.000000	1	0.000808	0.000000	101.000000	1
50%	1.333333	3.000000	1	75.875431	1.000000	118.000000	1
75%	2.000000	6.000000	1	547.338934	183.000000	170.000000	1
max	29.000000	995.000000	1	1649.334414	1532.000000	99647.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	19052.000000	19052.000000	19052.000000	19052.000000	19052.000000	19052.000000	19052
mean	1.614488	8.209479	0.958640	318.495409	121.874816	266.119882	0.801963
std	0.967889	28.177306	0.199128	429.907212	205.418269	1450.121524	0.3985306
min	1.000000	1.000000	0.000000	0.000000	0.000000	50.000000	False
25%	1.000000	1.000000	1.000000	0.000772	0.000000	101.000000	1
50%	1.333333	3.000000	1.000000	73.488142	1.000000	118.000000	1
75%	2.000000	6.000000	1.000000	545.616584	182.000000	170.000000	1
max	29.000000	995.000000	1.000000	1649.334414	1532.000000	99647.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	61.000000	61.000000	61	61.000000	61.000000	61.000000	61
mean	2.494243	54.295082	0	791.178752	218.918033	1487.803279	0.6229508
std	1.743588	94.715072	0	511.045778	204.107185	2274.934327	0.4886694
min	1.000000	1.000000	0	0.000000	0.000000	411.000000	False
25%	1.535714	14.000000	0	372.744458	58.000000	517.000000	0
50%	2.173913	28.000000	0	864.106413	174.000000	726.000000	1
75%	2.740741	56.000000	0	1208.436864	353.000000	1285.000000	1
max	10.921053	638.000000	0	1615.406364	905.000000	15164.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	1525.000000	1525.000000	1525	1525.000000	1525.000000	1525.000000	1525
mean	2.224935	52.252459	1	860.348298	221.782295	1690.312787	0.6242623
std	1.371311	83.849031	0	499.472963	193.750736	4874.398334	0.4844717
min	1.000000	1.000000	1	0.000000	0.000000	401.000000	False
25%	1.482759	14.000000	1	443.051386	81.000000	511.000000	0
50%	1.852564	28.000000	1	890.464130	178.000000	713.000000	1
75%	2.465116	55.000000	1	1308.857871	302.000000	1346.000000	1
max	18.428571	995.000000	1	1649.334414	1126.000000	99647.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	1586.000000	1586.000000	1586.000000	1586.000000	1586.000000	1586.00000	1586
mean	2.235293	52.331021	0.961538	857.687931	221.672131	1682.52396	0.6242119
std	1.387764	84.260476	0.192368	499.935586	194.092618	4800.28902	0.4844785
min	1.000000	1.000000	0.000000	0.000000	0.000000	401.00000	False
25%	1.485065	14.000000	1.000000	437.706032	80.000000	511.50000	0
50%	1.865766	28.000000	1.000000	890.143155	177.500000	716.50000	1
75%	2.488644	55.000000	1.000000	1306.842465	302.750000	1338.50000	1
max	18.428571	995.000000	1.000000	1649.334414	1126.000000	99647.00000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	727.000000	727.000000	727	727.000000	727.000000	727.000000	727
mean	1.615860	4.126547	0	240.949079	108.786795	133.656121	0.8170564
std	0.940177	5.572260	0	361.007830	212.771719	63.030995	0.3868865
min	1.000000	1.000000	0	0.000000	0.000000	51.000000	False
25%	1.000000	1.000000	0	0.000000	0.000000	101.000000	1
50%	1.285714	2.000000	0	15.068972	0.000000	111.000000	1
75%	2.000000	5.000000	0	395.414585	132.500000	141.000000	1
max	9.000000	57.000000	0	1509.208838	1397.000000	400.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	16739.000000	16739.000000	16739	16739.000000	16739.000000	16739.000000	16739
mean	1.555607	4.206344	1	270.775524	112.987574	137.670410	0.8181492
std	0.897534	5.366969	0	388.527346	203.633635	62.310956	0.385733
min	1.000000	1.000000	1	0.000000	0.000000	50.000000	False
25%	1.000000	1.000000	1	0.000000	0.000000	101.000000	1
50%	1.250000	2.000000	1	37.370050	0.000000	115.000000	1
75%	1.857143	5.000000	1	444.341879	156.000000	150.000000	1
max	29.000000	80.000000	1	1645.057552	1532.000000	400.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	17466.000000	17466.000000	17466.000000	17466.000000	17466.000000	17466.000000	17466
mean	1.558115	4.203023	0.958376	269.534036	112.812722	137.503321	0.8181037
std	0.899402	5.375529	0.199734	387.456979	204.017554	62.344425	0.38577
min	1.000000	1.000000	0.000000	0.000000	0.000000	50.000000	False
25%	1.000000	1.000000	1.000000	0.000000	0.000000	101.000000	1
50%	1.250000	2.000000	1.000000	36.386098	0.000000	115.000000	1
75%	1.875000	5.000000	1.000000	441.509633	155.000000	149.000000	1
max	29.000000	80.000000	1.000000	1645.057552	1532.000000	400.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.425268	-0.009161	0.407633	0.393553	0.329942	-0.130060
days_active	0.425268	1.000000	0.009320	0.864971	0.817889	0.616885	-0.252284
gender_cat	-0.009161	0.009320	1.000000	0.014137	0.010376	0.017309	0.002831
lifetime	0.407633	0.864971	0.014137	1.000000	0.828082	0.543538	-0.258530
max_interval	0.393553	0.817889	0.010376	0.828082	1.000000	0.493821	-0.225892
reputation	0.329942	0.616885	0.017309	0.543538	0.493821	1.000000	-0.087809
dead	-0.130060	-0.252284	0.002831	-0.258530	-0.225892	-0.087809	1.000000

Dep. Variable:	days_active	No. Observations:	19052
Model:	GLM	Df Residuals:	19050
Model Family:	NegativeBinomial	Df Model:	1
Link Function:	log	Scale:	10.5053357719
Method:	IRLS	Log-Likelihood:	-60278.
Date:	Sun, 05 Oct 2014	Deviance:	30689.
Time:	23:18:31	Pearson chi2:	2.00e+05
No. Iterations:	7

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	2.0807	0.122	16.991	0.000	1.841 2.321
gender_cat	0.0256	0.125	0.205	0.838	-0.219 0.271

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.245548	-0.042593	-0.115657	-0.084766	0.323587	-0.150433
days_active	0.245548	1.000000	0.006048	0.485886	0.024067	0.625802	-0.443778
gender_cat	-0.042593	0.006048	1.000000	0.026432	0.011147	-0.007394	0.003344
lifetime	-0.115657	0.485886	0.026432	1.000000	0.607747	0.255104	-0.382232
max_interval	-0.084766	0.024067	0.011147	0.607747	1.000000	-0.040356	-0.055861
reputation	0.323587	0.625802	-0.007394	0.255104	-0.040356	1.000000	-0.224840
dead	-0.150433	-0.443778	0.003344	-0.382232	-0.055861	-0.224840	1.000000

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.381664	-0.008192	0.393000	0.380734	0.262264	-0.100779
days_active	0.381664	1.000000	0.008329	0.862652	0.827520	0.523514	-0.214981
gender_cat	-0.008192	0.008329	1.000000	0.012498	0.009938	0.018110	0.003460
lifetime	0.393000	0.862652	0.012498	1.000000	0.814070	0.475928	-0.222272
max_interval	0.380734	0.827520	0.009938	0.814070	1.000000	0.442743	-0.208311
reputation	0.262264	0.523514	0.018110	0.475928	0.442743	1.000000	-0.027771
dead	-0.100779	-0.214981	0.003460	-0.222272	-0.208311	-0.027771	1.000000