Community: Mathematics

Initialization and data importing are at the end of this notebook. They were placed at the bottom to make the analysis easier to read, but they must be run first for the analysis to work as expected. Click here to go there.

Data Segregation

All contributors



In [3]:

    
# Keep an untouched copy of every contributor, then carve out one frame per
# value of the 'gender' column.
all_users = df.copy()
females = df.loc[df['gender'] == 'Female']
males = df.loc[df['gender'] == 'Male']



In [4]:

    
# Medians are printed as text; the describe() table is the cell's rich output.
# A parenthesised single-argument print gives the same output on Python 2 and 3.
print(females.median())
females.describe()









    



activity_freq      2.040816
days_active        6.000000
gender_cat         0.000000
lifetime          75.865015
max_interval      22.000000
reputation       116.000000
dead               1.000000
dtype: float64






    Out[4]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       633.000000
       633.000000
       633
        633.000000
        633.000000
         633.000000
             633
    
    
      mean
         2.530979
        15.314376
         0
        186.537137
         74.799368
         387.303318
       0.6650869
    
    
      std
         1.686036
        36.476333
         0
        256.678853
        126.462042
        1958.974331
       0.4723333
    
    
      min
         1.000000
         1.000000
         0
          0.000000
          0.000000
          50.000000
           False
    
    
      25%
         1.500000
         2.000000
         0
          5.326962
          1.000000
          83.000000
               0
    
    
      50%
         2.040816
         6.000000
         0
         75.865015
         22.000000
         116.000000
               1
    
    
      75%
         3.000000
        14.000000
         0
        259.603018
        100.000000
         198.000000
               1
    
    
      max
        17.000000
       569.000000
         0
       1260.112666
       1107.000000
       45388.000000
            True



In [5]:

    
# Medians are printed as text; the describe() table is the cell's rich output.
# A parenthesised single-argument print gives the same output on Python 2 and 3.
print(males.median())
males.describe()









    



activity_freq      2.000000
days_active        4.000000
gender_cat         1.000000
lifetime          72.137721
max_interval      16.000000
reputation       121.000000
dead               1.000000
dtype: float64






    Out[5]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       7492.000000
       7492.000000
       7492
       7492.000000
       7492.000000
         7492.000000
            7492
    
    
      mean
          2.319418
         22.322744
          1
        235.644970
         87.263348
          868.969434
       0.6833956
    
    
      std
          1.680059
         73.250327
          0
        317.139542
        144.647498
         6045.880132
       0.4651827
    
    
      min
          1.000000
          1.000000
          1
          0.000000
          0.000000
           50.000000
           False
    
    
      25%
          1.264516
          1.000000
          1
          0.181954
          0.000000
          101.000000
               0
    
    
      50%
          2.000000
          4.000000
          1
         72.137721
         16.000000
          121.000000
               1
    
    
      75%
          2.818182
         12.000000
          1
        377.796521
        121.000000
          242.250000
               1
    
    
      max
         35.980861
       1140.000000
          1
       1275.221344
       1184.000000
       221779.000000
            True



In [6]:

    
# Medians are printed as text; the describe() table is the cell's rich output.
# A parenthesised single-argument print gives the same output on Python 2 and 3.
print(all_users.median())
all_users.describe()









    



activity_freq      2.000000
days_active        4.000000
gender_cat         1.000000
lifetime          72.748648
max_interval      17.000000
reputation       121.000000
dead               1.000000
dtype: float64






    Out[6]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       8125.000000
       8125.000000
       8125.000000
       8125.000000
       8125.000000
         8125.000000
            8125
    
    
      mean
          2.335900
         21.776738
          0.922092
        231.819092
         86.292308
          831.443938
       0.6819692
    
    
      std
          1.681378
         71.095517
          0.268043
        313.112478
        143.345526
         5832.648819
       0.4657402
    
    
      min
          1.000000
          1.000000
          0.000000
          0.000000
          0.000000
           50.000000
           False
    
    
      25%
          1.300000
          1.000000
          1.000000
          0.265104
          0.000000
          101.000000
               0
    
    
      50%
          2.000000
          4.000000
          1.000000
         72.748648
         17.000000
          121.000000
               1
    
    
      75%
          2.839450
         13.000000
          1.000000
        367.532617
        118.000000
          238.000000
               1
    
    
      max
         35.980861
       1140.000000
          1.000000
       1275.221344
       1184.000000
       221779.000000
            True

Top contributors



In [7]:

    
# "Top" contributors are those whose reputation is strictly above 300.
top_users = all_users.loc[all_users["reputation"] > 300]
top_females = females.loc[females["reputation"] > 300]
top_males = males.loc[males["reputation"] > 300]



In [8]:

    
# Medians are printed as text; the describe() table is the cell's rich output.
# A parenthesised single-argument print gives the same output on Python 2 and 3.
print(top_females.median())
top_females.describe()









    



activity_freq      2.500000
days_active       28.000000
gender_cat         0.000000
lifetime         296.982965
max_interval      70.000000
reputation       504.000000
dead               0.000000
dtype: float64






    Out[8]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       123.000000
       123.000000
       123
        123.000000
       123.000000
         123.000000
             123
    
    
      mean
         2.875506
        51.341463
         0
        385.739611
       114.186992
        1524.268293
        0.398374
    
    
      std
         1.704451
        70.958523
         0
        314.810125
       148.115780
        4272.204505
       0.4915655
    
    
      min
         1.000000
         1.000000
         0
          0.000000
         0.000000
         302.000000
           False
    
    
      25%
         1.857143
        16.500000
         0
        131.991543
        32.500000
         363.500000
               0
    
    
      50%
         2.500000
        28.000000
         0
        296.982965
        70.000000
         504.000000
               0
    
    
      75%
         3.223819
        54.500000
         0
        580.129796
       135.500000
        1341.500000
               1
    
    
      max
        13.727273
       569.000000
         0
       1260.112666
       960.000000
       45388.000000
            True



In [9]:

    
# Medians are printed as text; the describe() table is the cell's rich output.
# A parenthesised single-argument print gives the same output on Python 2 and 3.
print(top_males.median())
top_males.describe()









    



activity_freq      2.390000
days_active       36.000000
gender_cat         1.000000
lifetime         477.649125
max_interval      93.000000
reputation       751.000000
dead               0.000000
dtype: float64






    Out[9]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       1683.000000
       1683.000000
       1683
       1683.000000
       1683.000000
         1683.000000
            1683
    
    
      mean
          2.957868
         83.792632
          1
        531.179951
        129.666667
         3451.025550
       0.3808675
    
    
      std
          2.209395
        137.517407
          0
        364.705040
        134.824275
        12416.953799
       0.4857444
    
    
      min
          1.000000
          1.000000
          1
          0.000000
          0.000000
          301.000000
           False
    
    
      25%
          1.826087
         18.000000
          1
        217.543984
         36.000000
          421.500000
               0
    
    
      50%
          2.390000
         36.000000
          1
        477.649125
         93.000000
          751.000000
               0
    
    
      75%
          3.333333
         82.000000
          1
        810.023356
        174.000000
         1965.000000
               1
    
    
      max
         35.980861
       1140.000000
          1
       1275.221344
       1131.000000
       221779.000000
            True



In [10]:

    
# Medians are printed as text; the describe() table is the cell's rich output.
# A parenthesised single-argument print gives the same output on Python 2 and 3.
print(top_users.median())
top_users.describe()









    



activity_freq      2.397698
days_active       35.000000
gender_cat         1.000000
lifetime         466.897788
max_interval      91.000000
reputation       730.500000
dead               0.000000
dtype: float64






    Out[10]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       1806.000000
       1806.000000
       1806.000000
       1806.000000
       1806.000000
         1806.000000
            1806
    
    
      mean
          2.952259
         81.582503
          0.931894
        521.274546
        128.612403
         3319.801218
       0.3820598
    
    
      std
          2.178434
        134.274196
          0.251998
        363.301187
        135.782608
        12047.556828
       0.4860256
    
    
      min
          1.000000
          1.000000
          0.000000
          0.000000
          0.000000
          301.000000
           False
    
    
      25%
          1.830285
         18.000000
          1.000000
        207.021883
         36.000000
          414.250000
               0
    
    
      50%
          2.397698
         35.000000
          1.000000
        466.897788
         91.000000
          730.500000
               0
    
    
      75%
          3.321547
         80.000000
          1.000000
        794.866069
        172.000000
         1884.750000
               1
    
    
      max
         35.980861
       1140.000000
          1.000000
       1275.221344
       1131.000000
       221779.000000
            True

Common contributors



In [11]:

    
# "Common" contributors are the complement of the top group: reputation of
# 300 or less.
common_users = all_users.loc[all_users["reputation"] <= 300]
common_females = females.loc[females["reputation"] <= 300]
common_males = males.loc[males["reputation"] <= 300]



In [12]:

    
# Medians are printed as text; the describe() table is the cell's rich output.
# A parenthesised single-argument print gives the same output on Python 2 and 3.
print(common_females.median())
common_females.describe()









    



activity_freq      2.000000
days_active        4.000000
gender_cat         0.000000
lifetime          46.969315
max_interval      13.000000
reputation       105.000000
dead               1.000000
dtype: float64






    Out[12]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       510.000000
       510.000000
       510
        510.000000
        510.000000
       510.000000
             510
    
    
      mean
         2.447888
         6.625490
         0
        138.494187
         65.300000
       113.094118
       0.7294118
    
    
      std
         1.672643
         7.476941
         0
        214.823293
        118.885138
        50.373253
       0.4446999
    
    
      min
         1.000000
         1.000000
         0
          0.000000
          0.000000
        50.000000
           False
    
    
      25%
         1.333333
         2.000000
         0
          2.013373
          0.000000
        74.000000
               0
    
    
      50%
         2.000000
         4.000000
         0
         46.969315
         13.000000
       105.000000
               1
    
    
      75%
         3.000000
         9.000000
         0
        190.355549
         82.500000
       136.750000
               1
    
    
      max
        17.000000
        85.000000
         0
       1164.055197
       1107.000000
       298.000000
            True



In [13]:

    
# Medians are printed as text; the describe() table is the cell's rich output.
# A parenthesised single-argument print gives the same output on Python 2 and 3.
print(common_males.median())
common_males.describe()









    



activity_freq      1.833333
days_active        2.000000
gender_cat         1.000000
lifetime          15.055017
max_interval       1.000000
reputation       111.000000
dead               1.000000
dtype: float64






    Out[13]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       5809.000000
       5809.000000
       5809
       5809.000000
       5809.000000
       5809.000000
            5809
    
    
      mean
          2.134444
          4.513514
          1
        150.021735
         74.978137
        120.888793
       0.7710449
    
    
      std
          1.440316
          5.623874
          0
        241.993352
        145.084698
         46.529120
       0.4201964
    
    
      min
          1.000000
          1.000000
          1
          0.000000
          0.000000
         50.000000
           False
    
    
      25%
          1.000000
          1.000000
          1
          0.029319
          0.000000
        101.000000
               1
    
    
      50%
          1.833333
          2.000000
          1
         15.055017
          1.000000
        111.000000
               1
    
    
      75%
          2.583333
          6.000000
          1
        216.979772
         86.000000
        136.000000
               1
    
    
      max
         20.000000
         81.000000
          1
       1258.932369
       1184.000000
        300.000000
            True



In [14]:

    
# Medians are printed as text; the describe() table is the cell's rich output.
# A parenthesised single-argument print gives the same output on Python 2 and 3.
print(common_users.median())
common_users.describe()









    



activity_freq      1.875000
days_active        3.000000
gender_cat         1.000000
lifetime          18.297742
max_interval       2.000000
reputation       111.000000
dead               1.000000
dtype: float64






    Out[14]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       6319.000000
       6319.000000
       6319.000000
       6319.000000
       6319.000000
       6319.000000
            6319
    
    
      mean
          2.159742
          4.683969
          0.919291
        149.091359
         74.197025
        120.259693
       0.7676848
    
    
      std
          1.462784
          5.823209
          0.272409
        239.919657
        143.164312
         46.894939
       0.4223424
    
    
      min
          1.000000
          1.000000
          0.000000
          0.000000
          0.000000
         50.000000
           False
    
    
      25%
          1.000000
          1.000000
          1.000000
          0.034384
          0.000000
        101.000000
               1
    
    
      50%
          1.875000
          3.000000
          1.000000
         18.297742
          2.000000
        111.000000
               1
    
    
      75%
          2.625000
          6.000000
          1.000000
        210.994248
         85.000000
        136.000000
               1
    
    
      max
         20.000000
         85.000000
          1.000000
       1258.932369
       1184.000000
        300.000000
            True

Third Question: Do women and men contribute for the same amount of time?

Hypothesis 1: The survival time of users is the same among genders.

H0: survivalTime(Males) = survivalTime(Females);

H1: survivalTime(Males) != survivalTime(Females).

Data



In [15]:

    
from lifelines import KaplanMeierFitter

# Boolean row selectors for each gender within all_users.
males_ = all_users["gender"] == "Male"
females_ = all_users["gender"] == "Female"

# Survival inputs: T is the duration (measured in days), C is the 'dead' flag
# passed to the fitter as event_observed in the next cell.
C = all_users["dead"]
T = all_users["lifetime"]

kmf = KaplanMeierFitter()



In [16]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  133.15021956
Median:  154.908315359






    Out[16]:





<matplotlib.text.Text at 0x10fc0f110>

Hypothesis test



In [17]:

    
from lifelines.statistics import logrank_test

# Log-rank test comparing the female and male lifetimes; the printed report
# below the cell comes from the call itself.
# NOTE(review): the three-value return and alpha=.95 match the lifelines
# version this notebook was run with -- confirm before upgrading lifelines.
summary, p_value, test_results = logrank_test(
    T[females_], T[males_],
    C[females_], C[males_],
    alpha=.95)









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.55255 |              0.353 |     None

Looking at the top contributors



In [18]:

    
from lifelines import KaplanMeierFitter

# Boolean row selectors for each gender within top_users.
males_ = top_users["gender"] == "Male"
females_ = top_users["gender"] == "Female"

# Survival inputs: T is the duration (measured in days), C is the 'dead' flag
# passed to the fitter as event_observed in the next cell.
C = top_users["dead"]
T = top_users["lifetime"]

kmf = KaplanMeierFitter()



In [19]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  664.479191736
Median:  965.971062222






    Out[19]:





<matplotlib.text.Text at 0x110125650>

Hypothesis test



In [20]:

    
from lifelines.statistics import logrank_test

# Log-rank test comparing the female and male lifetimes among top contributors;
# the printed report below the cell comes from the call itself.
# NOTE(review): the three-value return and alpha=.95 match the lifelines
# version this notebook was run with -- confirm before upgrading lifelines.
summary, p_value, test_results = logrank_test(
    T[females_], T[males_],
    C[females_], C[males_],
    alpha=.95)









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.00899 |              6.824 |     True

Looking at the common contributors



In [21]:

    
from lifelines import KaplanMeierFitter

# Boolean row selectors for each gender within common_users.
males_ = common_users["gender"] == "Male"
females_ = common_users["gender"] == "Female"

# Survival inputs: T is the duration (measured in days), C is the 'dead' flag
# passed to the fitter as event_observed in the next cell.
C = common_users["dead"]
T = common_users["lifetime"]

kmf = KaplanMeierFitter()



In [22]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  70.6415111921
Median:  32.8633929398






    Out[22]:





<matplotlib.text.Text at 0x10f6e5b10>

Hypothesis test



In [23]:

    
from lifelines.statistics import logrank_test

# Log-rank test comparing the female and male lifetimes among common
# contributors; the printed report below the cell comes from the call itself.
# NOTE(review): the three-value return and alpha=.95 match the lifelines
# version this notebook was run with -- confirm before upgrading lifelines.
summary, p_value, test_results = logrank_test(
    T[females_], T[males_],
    C[females_], C[males_],
    alpha=.95)









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.06530 |              3.397 |     None

Hypothesis 2: The amount of activity days is the same between genders.

H0: daysActive(Males) = daysActive(Females);

H1: daysActive(Males) != daysActive(Females).

Correlation and Negative Binomial Regression



In [28]:

    
# Pairwise Spearman rank correlation between the columns of all_users.
all_users.corr("spearman")









    Out[28]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.345089
      -0.050289
       0.256427
       0.225526
       0.247110
      -0.146746
    
    
      days_active
       0.345089
       1.000000
      -0.055666
       0.825906
       0.737583
       0.622819
      -0.443087
    
    
      gender_cat
      -0.050289
      -0.055666
       1.000000
      -0.004907
      -0.024281
       0.054600
       0.010345
    
    
      lifetime
       0.256427
       0.825906
      -0.004907
       1.000000
       0.863469
       0.543556
      -0.381632
    
    
      max_interval
       0.225526
       0.737583
      -0.024281
       0.863469
       1.000000
       0.433997
      -0.320157
    
    
      reputation
       0.247110
       0.622819
       0.054600
       0.543556
       0.433997
       1.000000
      -0.291969
    
    
      dead
      -0.146746
      -0.443087
       0.010345
      -0.381632
      -0.320157
      -0.291969
       1.000000



In [29]:

    
# Response: number of active days. Predictor: the gender code plus an
# intercept column placed first.
endog = all_users["days_active"]
exog = sm.add_constant(all_users["gender_cat"], prepend=True)

# Negative binomial GLM (the summary below reports a log link).
mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())
res_nbin = mod_nbin.fit()

res_nbin.summary()









    Out[29]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:        8125    


  Model:                  GLM          Df Residuals:            8123    


  Model Family:    NegativeBinomial    Df Model:                   1    


  Link Function:          log          Scale:              9.91852947776


  Method:                IRLS          Log-Likelihood:        -33305.   


  Date:            Sun, 05 Oct 2014    Deviance:               21191.   


  Time:                23:07:09        Pearson chi2:         8.06e+04   


  No. Iterations:          8                                            




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           2.7288      0.129     21.121   0.000      2.476     2.982


  gender_cat      0.3768      0.134      2.803   0.005      0.113     0.640

Data's shape



In [30]:

    
# Compare the model's fitted means with the observed response, where each
# observation is expressed as its share of the total.
nobs = res_nbin.nobs

# endog is a 1-D Series, so it has no axis 1. The original `endog.sum(1)` only
# worked because the axis argument was ignored at the time; newer pandas
# rejects it. Summing over the single axis gives the same value.
y = endog / endog.sum()

# Fitted mean response for every observation.
yhat = res_nbin.mu

pyplot.scatter(yhat, y)

res_nbin.params









    Out[30]:





const         2.728792
gender_cat    0.376814
dtype: float64

Looking at the top contributors



In [31]:

    
# Pairwise Spearman rank correlation between the columns of top_users.
top_users.corr("spearman")









    Out[31]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.268531
      -0.011003
      -0.205129
      -0.268695
       0.318475
      -0.112084
    
    
      days_active
       0.268531
       1.000000
       0.059173
       0.523422
      -0.044783
       0.818487
      -0.404213
    
    
      gender_cat
      -0.011003
       0.059173
       1.000000
       0.101702
       0.046823
       0.079837
      -0.016184
    
    
      lifetime
      -0.205129
       0.523422
       0.101702
       1.000000
       0.598659
       0.415470
      -0.255136
    
    
      max_interval
      -0.268695
      -0.044783
       0.046823
       0.598659
       1.000000
      -0.085554
      -0.043287
    
    
      reputation
       0.318475
       0.818487
       0.079837
       0.415470
      -0.085554
       1.000000
      -0.311134
    
    
      dead
      -0.112084
      -0.404213
      -0.016184
      -0.255136
      -0.043287
      -0.311134
       1.000000



In [32]:

    
# Response: number of active days. Predictor: the gender code plus an
# intercept column placed first.
endog = top_users["days_active"]
exog = sm.add_constant(top_users["gender_cat"], prepend=True)

# Negative binomial GLM (the summary below reports a log link).
mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())
res_nbin = mod_nbin.fit()

res_nbin.summary()









    Out[32]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:        1806    


  Model:                  GLM          Df Residuals:            1804    


  Model Family:    NegativeBinomial    Df Model:                   1    


  Link Function:          log          Scale:              2.60836403649


  Method:                IRLS          Log-Likelihood:        -9754.5   


  Date:            Sun, 05 Oct 2014    Deviance:               2600.2   


  Time:                23:07:31        Pearson chi2:         4.71e+03   


  No. Iterations:          7                                            




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           3.9385      0.147     26.786   0.000      3.650     4.227


  gender_cat      0.4898      0.152      3.217   0.001      0.191     0.788

Data's shape



In [33]:

    
# Compare the model's fitted means with the observed response, where each
# observation is expressed as its share of the total.
nobs = res_nbin.nobs

# endog is a 1-D Series, so it has no axis 1. The original `endog.sum(1)` only
# worked because the axis argument was ignored at the time; newer pandas
# rejects it. Summing over the single axis gives the same value.
y = endog / endog.sum()

# Fitted mean response for every observation.
yhat = res_nbin.mu

pyplot.scatter(yhat, y)

res_nbin.params









    Out[33]:





const         3.938499
gender_cat    0.489846
dtype: float64

Looking at the common contributors



In [34]:

    
# Pairwise Spearman rank correlation between the columns of common_users.
common_users.corr("spearman")









    Out[34]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.261831
      -0.067385
       0.262435
       0.239374
       0.086382
      -0.059026
    
    
      days_active
       0.261831
       1.000000
      -0.112767
       0.822551
       0.799398
       0.260494
      -0.298644
    
    
      gender_cat
      -0.067385
      -0.112767
       1.000000
      -0.046555
      -0.059154
       0.059909
       0.027977
    
    
      lifetime
       0.262435
       0.822551
      -0.046555
       1.000000
       0.857933
       0.308289
      -0.262599
    
    
      max_interval
       0.239374
       0.799398
      -0.059154
       0.857933
       1.000000
       0.270982
      -0.258137
    
    
      reputation
       0.086382
       0.260494
       0.059909
       0.308289
       0.270982
       1.000000
      -0.054298
    
    
      dead
      -0.059026
      -0.298644
       0.027977
      -0.262599
      -0.258137
      -0.054298
       1.000000



In [35]:

    
# Response: number of active days. Predictor: the gender code plus an
# intercept column placed first.
endog = common_users["days_active"]
exog = sm.add_constant(common_users["gender_cat"], prepend=True)

# Negative binomial GLM (the summary below reports a log link).
mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())
res_nbin = mod_nbin.fit()

res_nbin.summary()









    Out[35]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:        6319    


  Model:                  GLM          Df Residuals:            6317    


  Model Family:    NegativeBinomial    Df Model:                   1    


  Link Function:          log          Scale:              1.25770015493


  Method:                IRLS          Log-Likelihood:        -16675.   


  Date:            Sun, 05 Oct 2014    Deviance:               4840.6   


  Time:                23:07:32        Pearson chi2:         7.94e+03   


  No. Iterations:          6                                            




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           1.8909      0.053     35.493   0.000      1.787     1.995


  gender_cat     -0.3838      0.056     -6.891   0.000     -0.493    -0.275

Data's shape



In [36]:

    
# Compare the model's fitted means with the observed response, where each
# observation is expressed as its share of the total.
nobs = res_nbin.nobs

# endog is a 1-D Series, so it has no axis 1. The original `endog.sum(1)` only
# worked because the axis argument was ignored at the time; newer pandas
# rejects it. Summing over the single axis gives the same value.
y = endog / endog.sum()

# Fitted mean response for every observation.
yhat = res_nbin.mu

pyplot.scatter(yhat, y)

res_nbin.params









    Out[36]:





const         1.890924
gender_cat   -0.383848
dtype: float64

Hypothesis 3: The frequency of participation is the same between genders.

H0: frequency(Males) = frequency(Females);

H1: frequency(Males) != frequency(Females).

Data



In [37]:

    
# Pull the activity-frequency column out of each gender's frame.
females_frequency, males_frequency = (
    group['activity_freq'] for group in (females, males)
)

Data's summary



In [38]:

    
# Text summary of the activity frequency for each gender: describe() output
# followed by the median, with a blank line separating the two groups.
# These are Python 2 print statements; the bare `print` emits the blank line.
print "Female:"
print females_frequency.describe()
print "Median: ", females_frequency.median()

print 
print "Male:"
print males_frequency.describe()
print "Median: ", males_frequency.median()









    



Female:
count    633.000000
mean       2.530979
std        1.686036
min        1.000000
25%        1.500000
50%        2.040816
75%        3.000000
max       17.000000
dtype: float64
Median:  2.04081632653

Male:
count    7492.000000
mean        2.319418
std         1.680059
min         1.000000
25%         1.264516
50%         2.000000
75%         2.818182
max        35.980861
dtype: float64
Median:  2.0

The data's shape



In [39]:

    
# Side-by-side histograms: females on the left Axes, males on the right.
fig, axes = pyplot.subplots(1, 2, figsize=(15, 7))

females_frequency.hist(ax=axes[0])
males_frequency.hist(ax=axes[1])

axes[0].set_title("Post Frequency by Females - Histogram")
axes[1].set_title("Post Frequency by Males - Histogram")









    Out[39]:





<matplotlib.text.Text at 0x114d24690>

Hypothesis test



In [40]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( females_frequency, males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_frequency, males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  6.346823612e-06

Looking at the top contributors



In [41]:

    
# Pull the activity-frequency column out of each gender's top-contributor frame.
top_females_frequency, top_males_frequency = (
    group['activity_freq'] for group in (top_females, top_males)
)

Data's summary



In [42]:

    
# Text summary of the activity frequency for top contributors of each gender:
# describe() output followed by the median, with a blank line between groups.
# These are Python 2 print statements; the bare `print` emits the blank line.
print "Female:"
print top_females_frequency.describe()
print "Median: ", top_females_frequency.median()

print 
print "Male:"
print top_males_frequency.describe()
print "Median: ", top_males_frequency.median()









    



Female:
count    123.000000
mean       2.875506
std        1.704451
min        1.000000
25%        1.857143
50%        2.500000
75%        3.223819
max       13.727273
dtype: float64
Median:  2.5

Male:
count    1683.000000
mean        2.957868
std         2.209395
min         1.000000
25%         1.826087
50%         2.390000
75%         3.333333
max        35.980861
dtype: float64
Median:  2.39

The data's shape



In [43]:

    
# Side-by-side histograms: females on the left Axes, males on the right.
fig, axes = pyplot.subplots(1, 2, figsize=(15, 7))

top_females_frequency.hist(ax=axes[0])
top_males_frequency.hist(ax=axes[1])

axes[0].set_title("Post Frequency by Females - Histogram")
axes[1].set_title("Post Frequency by Males - Histogram")









    Out[43]:





<matplotlib.text.Text at 0x11529eb50>

Hypothesis test



In [44]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( top_females_frequency, top_males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_frequency, top_males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  0.640221385367

Looking at the common contributors



In [45]:

    
# Pull the activity-frequency column out of each gender's common-contributor frame.
common_females_frequency, common_males_frequency = (
    group['activity_freq'] for group in (common_females, common_males)
)

Data's summary



In [46]:

    
# Text summary of the activity frequency for common contributors of each
# gender: describe() output followed by the median, with a blank line between
# groups.
# These are Python 2 print statements; the bare `print` emits the blank line.
print "Female:"
print common_females_frequency.describe()
print "Median: ", common_females_frequency.median()

print 
print "Male:"
print common_males_frequency.describe()
print "Median: ", common_males_frequency.median()









    



Female:
count    510.000000
mean       2.447888
std        1.672643
min        1.000000
25%        1.333333
50%        2.000000
75%        3.000000
max       17.000000
dtype: float64
Median:  2.0

Male:
count    5809.000000
mean        2.134444
std         1.440316
min         1.000000
25%         1.000000
50%         1.833333
75%         2.583333
max        20.000000
dtype: float64
Median:  1.83333333333

The data's shape



In [47]:

    
# Side-by-side histograms: females on the left Axes, males on the right.
fig, axes = pyplot.subplots(1, 2, figsize=(15, 7))

common_females_frequency.hist(ax=axes[0])
common_males_frequency.hist(ax=axes[1])

axes[0].set_title("Post Frequency by Females - Histogram")
axes[1].set_title("Post Frequency by Males - Histogram")









    Out[47]:





<matplotlib.text.Text at 0x114c09a90>

Hypothesis test



In [48]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( common_females_frequency, common_males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_frequency, common_males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  1.09573723961e-07

Initialization

Here you can find the data importing and some useful functions used for analysing the data. Please, run this first, otherwise the analysis will not work.

Importing the data from the MongoDB database and loading it into a pandas DataFrame for easy manipulation.



In [1]:

    
from __future__ import division
import pymongo, time, pylab, numpy, pandas, math
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot
import statsmodels.api as sm

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the plots drawn after this point.
mpl.style.use('ggplot')
# Disabled call that would reset matplotlib's rc settings to their defaults.
# pyplot.rcdefaults()

# Connect to the MongoDB server on localhost and pick the per-community
# `statistics` collection.
client = pymongo.MongoClient('localhost', 27017)

community = 'math'
stats_db = client[community].statistics

# Keep documents with at least one question, answer or comment, and whose
# gender is not "Unknown".
contributor_filter = {
    '$or': [
        {'questions_total': {'$gt': 0}},
        {'answers_total': {'$gt': 0}},
        {'comments_total': {'$gt': 0}},
    ],
    'gender': {'$ne': "Unknown"},
}

# Projection: drop Mongo's _id and fetch only the fields listed here.
wanted_fields = {
    u'_id': False,
    u'dates': True,
    u'reputation': True,
    u'joined': True,
    u'gender': True,
    'lifetime': True,
    'max_interval': True,
    'days_active': True,
    'gender_cat': True,
    'activity_freq': True,
}

cursor = stats_db.find(contributor_filter, wanted_fields)

# Materialise the cursor so every matching document becomes one DataFrame row.
df = pandas.DataFrame(list(cursor))



In [2]:

    
import datetime

def _death_cutoff(frame):
    """Return the datetime before which a last activity counts as "dead".

    The cutoff is 2014-01-20 minus the mean of ``frame["max_interval"]``,
    truncated to whole days.
    """
    # NOTE(review): 2014-01-20 is presumably the date of the data snapshot -- confirm.
    mean_gap_days = int(frame["max_interval"].mean())
    return datetime.datetime(2014, 1, 20) - datetime.timedelta(days=mean_gap_days)

def seen_death(row, recent=None):
    """Tell whether a contributor's last recorded date is older than the cutoff.

    Parameters:
        row: a row of ``df``; ``row["dates"][-1]`` is compared to the cutoff.
        recent: optional precomputed cutoff datetime. When omitted it is
            derived from the global ``df`` on each call, as before.

    Returns:
        True when the last entry of ``row["dates"]`` is earlier than the cutoff.
    """
    if recent is None:
        recent = _death_cutoff(df)
    return row["dates"][-1] < recent

# The cutoff does not depend on the row, so compute it once here instead of
# re-averaging the whole max_interval column for every row inside apply().
df["dead"] = df.apply(seen_death, axis=1, recent=_death_cutoff(df))



In [ ]:

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	633.000000	633.000000	633	633.000000	633.000000	633.000000	633
mean	2.530979	15.314376	0	186.537137	74.799368	387.303318	0.6650869
std	1.686036	36.476333	0	256.678853	126.462042	1958.974331	0.4723333
min	1.000000	1.000000	0	0.000000	0.000000	50.000000	False
25%	1.500000	2.000000	0	5.326962	1.000000	83.000000	0
50%	2.040816	6.000000	0	75.865015	22.000000	116.000000	1
75%	3.000000	14.000000	0	259.603018	100.000000	198.000000	1
max	17.000000	569.000000	0	1260.112666	1107.000000	45388.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	7492.000000	7492.000000	7492	7492.000000	7492.000000	7492.000000	7492
mean	2.319418	22.322744	1	235.644970	87.263348	868.969434	0.6833956
std	1.680059	73.250327	0	317.139542	144.647498	6045.880132	0.4651827
min	1.000000	1.000000	1	0.000000	0.000000	50.000000	False
25%	1.264516	1.000000	1	0.181954	0.000000	101.000000	0
50%	2.000000	4.000000	1	72.137721	16.000000	121.000000	1
75%	2.818182	12.000000	1	377.796521	121.000000	242.250000	1
max	35.980861	1140.000000	1	1275.221344	1184.000000	221779.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	8125.000000	8125.000000	8125.000000	8125.000000	8125.000000	8125.000000	8125
mean	2.335900	21.776738	0.922092	231.819092	86.292308	831.443938	0.6819692
std	1.681378	71.095517	0.268043	313.112478	143.345526	5832.648819	0.4657402
min	1.000000	1.000000	0.000000	0.000000	0.000000	50.000000	False
25%	1.300000	1.000000	1.000000	0.265104	0.000000	101.000000	0
50%	2.000000	4.000000	1.000000	72.748648	17.000000	121.000000	1
75%	2.839450	13.000000	1.000000	367.532617	118.000000	238.000000	1
max	35.980861	1140.000000	1.000000	1275.221344	1184.000000	221779.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	123.000000	123.000000	123	123.000000	123.000000	123.000000	123
mean	2.875506	51.341463	0	385.739611	114.186992	1524.268293	0.398374
std	1.704451	70.958523	0	314.810125	148.115780	4272.204505	0.4915655
min	1.000000	1.000000	0	0.000000	0.000000	302.000000	False
25%	1.857143	16.500000	0	131.991543	32.500000	363.500000	0
50%	2.500000	28.000000	0	296.982965	70.000000	504.000000	0
75%	3.223819	54.500000	0	580.129796	135.500000	1341.500000	1
max	13.727273	569.000000	0	1260.112666	960.000000	45388.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	1683.000000	1683.000000	1683	1683.000000	1683.000000	1683.000000	1683
mean	2.957868	83.792632	1	531.179951	129.666667	3451.025550	0.3808675
std	2.209395	137.517407	0	364.705040	134.824275	12416.953799	0.4857444
min	1.000000	1.000000	1	0.000000	0.000000	301.000000	False
25%	1.826087	18.000000	1	217.543984	36.000000	421.500000	0
50%	2.390000	36.000000	1	477.649125	93.000000	751.000000	0
75%	3.333333	82.000000	1	810.023356	174.000000	1965.000000	1
max	35.980861	1140.000000	1	1275.221344	1131.000000	221779.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	1806.000000	1806.000000	1806.000000	1806.000000	1806.000000	1806.000000	1806
mean	2.952259	81.582503	0.931894	521.274546	128.612403	3319.801218	0.3820598
std	2.178434	134.274196	0.251998	363.301187	135.782608	12047.556828	0.4860256
min	1.000000	1.000000	0.000000	0.000000	0.000000	301.000000	False
25%	1.830285	18.000000	1.000000	207.021883	36.000000	414.250000	0
50%	2.397698	35.000000	1.000000	466.897788	91.000000	730.500000	0
75%	3.321547	80.000000	1.000000	794.866069	172.000000	1884.750000	1
max	35.980861	1140.000000	1.000000	1275.221344	1131.000000	221779.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	510.000000	510.000000	510	510.000000	510.000000	510.000000	510
mean	2.447888	6.625490	0	138.494187	65.300000	113.094118	0.7294118
std	1.672643	7.476941	0	214.823293	118.885138	50.373253	0.4446999
min	1.000000	1.000000	0	0.000000	0.000000	50.000000	False
25%	1.333333	2.000000	0	2.013373	0.000000	74.000000	0
50%	2.000000	4.000000	0	46.969315	13.000000	105.000000	1
75%	3.000000	9.000000	0	190.355549	82.500000	136.750000	1
max	17.000000	85.000000	0	1164.055197	1107.000000	298.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	5809.000000	5809.000000	5809	5809.000000	5809.000000	5809.000000	5809
mean	2.134444	4.513514	1	150.021735	74.978137	120.888793	0.7710449
std	1.440316	5.623874	0	241.993352	145.084698	46.529120	0.4201964
min	1.000000	1.000000	1	0.000000	0.000000	50.000000	False
25%	1.000000	1.000000	1	0.029319	0.000000	101.000000	1
50%	1.833333	2.000000	1	15.055017	1.000000	111.000000	1
75%	2.583333	6.000000	1	216.979772	86.000000	136.000000	1
max	20.000000	81.000000	1	1258.932369	1184.000000	300.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	6319.000000	6319.000000	6319.000000	6319.000000	6319.000000	6319.000000	6319
mean	2.159742	4.683969	0.919291	149.091359	74.197025	120.259693	0.7676848
std	1.462784	5.823209	0.272409	239.919657	143.164312	46.894939	0.4223424
min	1.000000	1.000000	0.000000	0.000000	0.000000	50.000000	False
25%	1.000000	1.000000	1.000000	0.034384	0.000000	101.000000	1
50%	1.875000	3.000000	1.000000	18.297742	2.000000	111.000000	1
75%	2.625000	6.000000	1.000000	210.994248	85.000000	136.000000	1
max	20.000000	85.000000	1.000000	1258.932369	1184.000000	300.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.345089	-0.050289	0.256427	0.225526	0.247110	-0.146746
days_active	0.345089	1.000000	-0.055666	0.825906	0.737583	0.622819	-0.443087
gender_cat	-0.050289	-0.055666	1.000000	-0.004907	-0.024281	0.054600	0.010345
lifetime	0.256427	0.825906	-0.004907	1.000000	0.863469	0.543556	-0.381632
max_interval	0.225526	0.737583	-0.024281	0.863469	1.000000	0.433997	-0.320157
reputation	0.247110	0.622819	0.054600	0.543556	0.433997	1.000000	-0.291969
dead	-0.146746	-0.443087	0.010345	-0.381632	-0.320157	-0.291969	1.000000

Dep. Variable:	days_active	No. Observations:	8125
Model:	GLM	Df Residuals:	8123
Model Family:	NegativeBinomial	Df Model:	1
Link Function:	log	Scale:	9.91852947776
Method:	IRLS	Log-Likelihood:	-33305.
Date:	Sun, 05 Oct 2014	Deviance:	21191.
Time:	23:07:09	Pearson chi2:	8.06e+04
No. Iterations:	8

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	2.7288	0.129	21.121	0.000	2.476 2.982
gender_cat	0.3768	0.134	2.803	0.005	0.113 0.640

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.268531	-0.011003	-0.205129	-0.268695	0.318475	-0.112084
days_active	0.268531	1.000000	0.059173	0.523422	-0.044783	0.818487	-0.404213
gender_cat	-0.011003	0.059173	1.000000	0.101702	0.046823	0.079837	-0.016184
lifetime	-0.205129	0.523422	0.101702	1.000000	0.598659	0.415470	-0.255136
max_interval	-0.268695	-0.044783	0.046823	0.598659	1.000000	-0.085554	-0.043287
reputation	0.318475	0.818487	0.079837	0.415470	-0.085554	1.000000	-0.311134
dead	-0.112084	-0.404213	-0.016184	-0.255136	-0.043287	-0.311134	1.000000

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.261831	-0.067385	0.262435	0.239374	0.086382	-0.059026
days_active	0.261831	1.000000	-0.112767	0.822551	0.799398	0.260494	-0.298644
gender_cat	-0.067385	-0.112767	1.000000	-0.046555	-0.059154	0.059909	0.027977
lifetime	0.262435	0.822551	-0.046555	1.000000	0.857933	0.308289	-0.262599
max_interval	0.239374	0.799398	-0.059154	0.857933	1.000000	0.270982	-0.258137
reputation	0.086382	0.260494	0.059909	0.308289	0.270982	1.000000	-0.054298
dead	-0.059026	-0.298644	0.027977	-0.262599	-0.258137	-0.054298	1.000000