Community: StackOverflow

Inicialization and importing data are at the end of this notebook. For better visualization of the analysis they were placed at the bottom, but it's necessary to run them first so the analysis work as expected. Click here to go there.

Data Segregation

All contributors



In [3]:

    
males = df[df['gender']=='Male']
females = df[df['gender']=='Female']
all_users = df.copy()



In [4]:

    
print females.median()
females.describe()









    



activity_freq      2.000000
days_active       13.000000
gender_cat         0.000000
lifetime         400.012305
max_interval     109.000000
reputation       132.000000
dead               0.000000
dtype: float64






    Out[4]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       7897.000000
       7897.000000
       7897
       7897.000000
       7897.000000
         7897.000000
            7897
    
    
      mean
          2.334396
         30.105610
          0
        507.451066
        166.153603
          694.373939
       0.4818285
    
    
      std
          1.508635
         62.035094
          0
        466.162353
        192.942626
         3526.726330
       0.4997013
    
    
      min
          1.000000
          1.000000
          0
          0.000000
          0.000000
           50.000000
           False
    
    
      25%
          1.444444
          5.000000
          0
         90.158949
         22.000000
           75.000000
               0
    
    
      50%
          2.000000
         13.000000
          0
        400.012305
        109.000000
          132.000000
               0
    
    
      75%
          2.722222
         30.000000
          0
        804.265243
        237.000000
          394.000000
               1
    
    
      max
         18.500000
       1187.000000
          0
       1978.319806
       1816.000000
       141184.000000
            True



In [5]:

    
print males.median()
males.describe()









    



activity_freq      1.833333
days_active       14.000000
gender_cat         1.000000
lifetime         552.830597
max_interval     141.000000
reputation       176.000000
dead               0.000000
dtype: float64






    Out[5]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       125340.000000
       125340.000000
       125340
       125340.000000
       125340.000000
       125340.000000
          125340
    
    
      mean
            2.195698
           37.163443
            1
          640.907015
          195.999737
         1169.622946
       0.4413276
    
    
      std
            1.501658
           80.493505
            0
          532.275464
          208.348216
         6567.785536
       0.4965476
    
    
      min
            1.000000
            1.000000
            1
            0.000000
            0.000000
           50.000000
           False
    
    
      25%
            1.352941
            5.000000
            1
          163.722270
           37.000000
           86.000000
               0
    
    
      50%
            1.833333
           14.000000
            1
          552.830597
          141.000000
          176.000000
               0
    
    
      75%
            2.521739
           35.000000
            1
         1008.412889
          280.000000
          606.000000
               1
    
    
      max
           45.857143
         1931.000000
            1
         1994.007311
         1892.000000
       640237.000000
            True



In [6]:

    
print all_users.median()
all_users.describe()









    



activity_freq      1.837838
days_active       14.000000
gender_cat         1.000000
lifetime         542.143747
max_interval     139.000000
reputation       173.000000
dead               0.000000
dtype: float64






    Out[6]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       133237.000000
       133237.000000
       133237.000000
       133237.000000
       133237.000000
       133237.000000
          133237
    
    
      mean
            2.203918
           36.745123
            0.940730
          632.997038
          194.230747
         1141.454784
       0.4437281
    
    
      std
            1.502424
           79.536283
            0.236131
          529.524465
          207.586004
         6428.748907
       0.4968252
    
    
      min
            1.000000
            1.000000
            0.000000
            0.000000
            0.000000
           50.000000
           False
    
    
      25%
            1.357143
            5.000000
            1.000000
          158.404719
           36.000000
           86.000000
               0
    
    
      50%
            1.837838
           14.000000
            1.000000
          542.143747
          139.000000
          173.000000
               0
    
    
      75%
            2.538462
           35.000000
            1.000000
          996.311754
          277.000000
          591.000000
               1
    
    
      max
           45.857143
         1931.000000
            1.000000
         1994.007311
         1892.000000
       640237.000000
            True

Top contributors



In [7]:

    
top_females = females[females["reputation"]> 1000]
top_males = males[males["reputation"]> 1000]

top_users = all_users[all_users["reputation"]> 1000]



In [8]:

    
print top_females.median()
top_females.describe()









    



activity_freq       2.605499
days_active        85.000000
gender_cat          0.000000
lifetime         1008.910835
max_interval      138.000000
reputation       2027.500000
dead                0.000000
dtype: float64






    Out[8]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       868.000000
        868.000000
       868
        868.000000
        868.000000
          868.000000
             868
    
    
      mean
         3.254567
        131.602535
         0
       1025.829577
        187.335253
         4651.165899
        0.235023
    
    
      std
         2.168423
        142.316046
         0
        490.574128
        169.879656
         9763.712921
       0.4242577
    
    
      min
         1.000000
          1.000000
         0
          0.000000
          0.000000
         1002.000000
           False
    
    
      25%
         1.960829
         51.000000
         0
        660.224362
         72.000000
         1330.750000
               0
    
    
      50%
         2.605499
         85.000000
         0
       1008.910835
        138.000000
         2027.500000
               0
    
    
      75%
         3.735379
        153.000000
         0
       1407.844001
        241.000000
         3837.250000
               0
    
    
      max
        17.769231
       1187.000000
         0
       1978.319806
       1470.000000
       141184.000000
            True



In [9]:

    
print top_males.median()
top_males.describe()









    



activity_freq       2.379310
days_active        89.000000
gender_cat          1.000000
lifetime         1186.508306
max_interval      156.000000
reputation       2272.500000
dead                0.000000
dtype: float64






    Out[9]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       21572.000000
       21572.000000
       21572
       21572.000000
       21572.000000
        21572.000000
           21572
    
    
      mean
           3.016451
         135.561700
           1
        1159.153001
         202.813184
         5655.511218
       0.2146764
    
    
      std
           2.254264
         155.665837
           0
         507.020628
         173.629913
        15036.391721
       0.4106072
    
    
      min
           1.000000
           1.000000
           1
           0.000000
           0.000000
         1001.000000
           False
    
    
      25%
           1.811881
          50.000000
           1
         779.427245
          86.000000
         1425.000000
               0
    
    
      50%
           2.379310
          89.000000
           1
        1186.508306
         156.000000
         2272.500000
               0
    
    
      75%
           3.385393
         159.000000
           1
        1587.056340
         266.000000
         4612.250000
               0
    
    
      max
          45.857143
        1931.000000
           1
        1994.007311
        1720.000000
       640237.000000
            True



In [10]:

    
print top_users.median()
top_users.describe()









    



activity_freq       2.385542
days_active        89.000000
gender_cat          1.000000
lifetime         1180.337493
max_interval      155.000000
reputation       2263.000000
dead                0.000000
dtype: float64






    Out[10]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       22440.000000
       22440.000000
       22440.000000
       22440.000000
       22440.000000
        22440.000000
           22440
    
    
      mean
           3.025662
         135.408556
           0.961319
        1153.995928
         202.214483
         5616.662210
       0.2154635
    
    
      std
           2.251426
         155.169758
           0.192838
         507.036016
         173.508318
        14868.358312
       0.4111526
    
    
      min
           1.000000
           1.000000
           0.000000
           0.000000
           0.000000
         1001.000000
           False
    
    
      25%
           1.815947
          50.000000
           1.000000
         775.158708
          85.000000
         1421.000000
               0
    
    
      50%
           2.385542
          89.000000
           1.000000
        1180.337493
         155.000000
         2263.000000
               0
    
    
      75%
           3.400000
         159.000000
           1.000000
        1580.400121
         265.000000
         4580.250000
               0
    
    
      max
          45.857143
        1931.000000
           1.000000
        1994.007311
        1720.000000
       640237.000000
            True

Common women contributors



In [11]:

    
common_females = females[females["reputation"] <= 1000]
common_males = males[males["reputation"] <= 1000]

common_users = all_users[all_users["reputation"]<= 1000]



In [12]:

    
print common_females.median()
common_females.describe()









    



activity_freq      1.900000
days_active       11.000000
gender_cat         0.000000
lifetime         340.848467
max_interval     104.000000
reputation       116.000000
dead               1.000000
dtype: float64






    Out[12]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       7029.000000
       7029.000000
       7029
       7029.000000
       7029.000000
       7029.000000
            7029
    
    
      mean
          2.220766
         17.571916
          0
        443.437331
        163.537914
        205.756011
       0.5123062
    
    
      std
          1.363644
         19.890419
          0
        420.916398
        195.453770
        203.109675
       0.4998841
    
    
      min
          1.000000
          1.000000
          0
          0.000000
          0.000000
         50.000000
           False
    
    
      25%
          1.400000
          5.000000
          0
         68.709563
         17.000000
         71.000000
               0
    
    
      50%
          1.900000
         11.000000
          0
        340.848467
        104.000000
        116.000000
               1
    
    
      75%
          2.600000
         23.000000
          0
        709.795095
        236.000000
        271.000000
               1
    
    
      max
         18.500000
        218.000000
          0
       1948.328314
       1816.000000
        999.000000
            True



In [13]:

    
print common_males.median()
common_males.describe()









    



activity_freq      1.733333
days_active       11.000000
gender_cat         1.000000
lifetime         441.254514
max_interval     136.000000
reputation       133.000000
dead               0.000000
dtype: float64






    Out[13]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       103768.000000
       103768.000000
       103768
       103768.000000
       103768.000000
       103768.000000
          103768
    
    
      mean
            2.025074
           16.707742
            1
          533.170502
          194.583311
          237.065878
       0.4884454
    
    
      std
            1.224021
           18.910212
            0
          470.461833
          214.836118
          223.953725
       0.4998689
    
    
      min
            1.000000
            1.000000
            1
            0.000000
            0.000000
           50.000000
           False
    
    
      25%
            1.285714
            4.000000
            1
          106.444369
           23.000000
           77.000000
               0
    
    
      50%
            1.733333
           11.000000
            1
          441.254514
          136.000000
          133.000000
               0
    
    
      75%
            2.357143
           22.000000
            1
          845.712161
          284.000000
          340.000000
               1
    
    
      max
           38.000000
          340.000000
            1
         1988.652516
         1892.000000
         1000.000000
            True



In [14]:

    
print common_users.median()
common_users.describe()









    



activity_freq      1.750000
days_active       11.000000
gender_cat         1.000000
lifetime         433.899242
max_interval     134.000000
reputation       132.000000
dead               0.000000
dtype: float64






    Out[14]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      count
       110797.000000
       110797.000000
       110797.000000
       110797.000000
       110797.000000
       110797.000000
          110797
    
    
      mean
            2.037489
           16.762566
            0.936560
          527.477799
          192.613780
          235.079569
       0.4899591
    
    
      std
            1.234264
           18.974977
            0.243754
          467.984372
          213.791891
          222.819234
       0.4999014
    
    
      min
            1.000000
            1.000000
            0.000000
            0.000000
            0.000000
           50.000000
           False
    
    
      25%
            1.285714
            4.000000
            1.000000
          103.061397
           22.000000
           76.000000
               0
    
    
      50%
            1.750000
           11.000000
            1.000000
          433.899242
          134.000000
          132.000000
               0
    
    
      75%
            2.375000
           22.000000
            1.000000
          836.893566
          281.000000
          337.000000
               1
    
    
      max
           38.000000
          340.000000
            1.000000
         1988.652516
         1892.000000
         1000.000000
            True

Third Question: Do women and men contribute for the same amount of time ?

Hypothesis 1: The survival time of users is the same among genders.

H0: survivalTime(Males) = survivalTime(Females);

H1: survivalTime(Males) != survivalTime(Females).

Data



In [15]:

    
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()

T = all_users["lifetime"] #measure in days
C = all_users["dead"]

females_ = all_users["gender"] == "Female"
males_ = all_users["gender"] == "Male"



In [16]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  841.138055058
Median:  1149.80419248






    Out[16]:





<matplotlib.text.Text at 0x12e770b10>

Hypothesis test



In [17]:

    
from lifelines.statistics import logrank_test

summary, p_value, test_results = logrank_test(T[females_], T[males_], C[females_], C[males_], alpha=.95 )









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.00000 |              239.829 |     True

Looking at the top contributors



In [18]:

    
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()

T = top_users["lifetime"] #measure in days
C = top_users["dead"]

females_ = top_users["gender"] == "Female"
males_ = top_users["gender"] == "Male"



In [19]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  inf
Median:  inf






    Out[19]:





<matplotlib.text.Text at 0x10e990d90>

Hypothesis test



In [20]:

    
from lifelines.statistics import logrank_test

summary, p_value, test_results = logrank_test(T[females_], T[males_], C[females_], C[males_], alpha=.95 )









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.00005 |              16.324 |     True

Looking at the common contributors



In [21]:

    
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()

T = common_users["lifetime"] #measure in days
C = common_users["dead"]

females_ = common_users["gender"] == "Female"
males_ = common_users["gender"] == "Male"



In [22]:

    
fig = pyplot.figure(figsize=(12, 6)) 
ax = pyplot.subplot(111)

kmf.fit(T[females_], event_observed=C[females_], label="Female")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

kmf.fit(T[males_], event_observed=C[males_], label="Male")
kmf.plot(ax=ax, ci_force_lines=False)
print "Median: ", kmf.median_

pyplot.ylim(0,1)
pyplot.title("Lifespans of users from different genders")









    



Median:  665.876370556
Median:  870.482279016






    Out[22]:





<matplotlib.text.Text at 0x11a280490>

Hypotesis test



In [23]:

    
from lifelines.statistics import logrank_test

summary, p_value, test_results = logrank_test(T[females_], T[males_], C[females_], C[males_], alpha=.95 )









    



Results
   df: 1
   alpha: 0.95
   t 0: -1
   test: logrank
   null distribution: chi squared

   __ p-value ___|__ test statistic __|__ test results __
         0.00000 |              113.496 |     True

Hypothesis 2: The amount of activity days is the same between genders.

H0: daysActive(Males) = daysActive(Females);

H1: daysActive(Males) != daysActive(Females).

Correlation and Binomial Negative Regression



In [24]:

    
all_users.corr(method="spearman")









    Out[24]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.439216
      -0.030015
       0.112770
       0.062791
       0.311337
      -0.211586
    
    
      days_active
       0.439216
       1.000000
       0.013928
       0.672904
       0.355899
       0.723341
      -0.495259
    
    
      gender_cat
      -0.030015
       0.013928
       1.000000
       0.055488
       0.037444
       0.052146
      -0.018992
    
    
      lifetime
       0.112770
       0.672904
       0.055488
       1.000000
       0.755676
       0.538123
      -0.426949
    
    
      max_interval
       0.062791
       0.355899
       0.037444
       0.755676
       1.000000
       0.227582
      -0.290184
    
    
      reputation
       0.311337
       0.723341
       0.052146
       0.538123
       0.227582
       1.000000
      -0.262980
    
    
      dead
      -0.211586
      -0.495259
      -0.018992
      -0.426949
      -0.290184
      -0.262980
       1.000000



In [25]:

    
exog = all_users["gender_cat"]
exog = sm.add_constant(exog, prepend=True)
endog = all_users["days_active"]

mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())

res_nbin = mod_nbin.fit()

res_nbin.summary()









    Out[25]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:      133237    


  Model:                  GLM          Df Residuals:          133235    


  Model Family:    NegativeBinomial    Df Model:                   1    


  Link Function:          log          Scale:              4.54114490135


  Method:                IRLS          Log-Likelihood:      -6.1507e+05 


  Date:            Sun, 05 Oct 2014    Deviance:            2.4668e+05  


  Time:                23:26:57        Pearson chi2:         6.05e+05   


  No. Iterations:          9                                            




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           3.4047      0.024    139.680   0.000      3.357     3.452


  gender_cat      0.2106      0.025      8.382   0.000      0.161     0.260

Data's shape



In [26]:

    
nobs = res_nbin.nobs

y = endog / endog.sum(1)

yhat = res_nbin.mu

pyplot.scatter(yhat, y)

res_nbin.params









    Out[26]:





const         3.404712
gender_cat    0.210614
dtype: float64

Looking at the top contributors



In [27]:

    
top_users.corr(method="spearman")









    Out[27]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.236617
      -0.033690
      -0.326952
      -0.216585
       0.267009
      -0.110130
    
    
      days_active
       0.236617
       1.000000
       0.003058
       0.299475
      -0.375538
       0.653917
      -0.399226
    
    
      gender_cat
      -0.033690
       0.003058
       1.000000
       0.051717
       0.024442
       0.026455
      -0.008762
    
    
      lifetime
      -0.326952
       0.299475
       0.051717
       1.000000
       0.415860
       0.300765
      -0.225730
    
    
      max_interval
      -0.216585
      -0.375538
       0.024442
       0.415860
       1.000000
      -0.177112
       0.089947
    
    
      reputation
       0.267009
       0.653917
       0.026455
       0.300765
      -0.177112
       1.000000
      -0.133198
    
    
      dead
      -0.110130
      -0.399226
      -0.008762
      -0.225730
       0.089947
      -0.133198
       1.000000



In [28]:

    
exog = top_users["gender_cat"]
exog = sm.add_constant(exog, prepend=True)
endog = top_users["days_active"]

mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())

res_nbin = mod_nbin.fit()

res_nbin.summary()









    Out[28]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:       22440    


  Model:                  GLM          Df Residuals:           22438    


  Model Family:    NegativeBinomial    Df Model:                   1    


  Link Function:          log          Scale:              1.30321188992


  Method:                IRLS          Log-Likelihood:      -1.3266e+05 


  Date:            Sun, 05 Oct 2014    Deviance:               19469.   


  Time:                23:27:24        Pearson chi2:         2.92e+04   


  No. Iterations:          6                                            




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           4.8798      0.039    125.461   0.000      4.804     4.956


  gender_cat      0.0296      0.040      0.747   0.455     -0.048     0.107

Data's shape



In [29]:

    
nobs = res_nbin.nobs

y = endog / endog.sum(1)

yhat = res_nbin.mu

pyplot.scatter(yhat, y)

res_nbin.params









    Out[29]:





const         4.879786
gender_cat    0.029641
dtype: float64

Looking at the common contributors



In [30]:

    
common_users.corr(method="spearman")









    Out[30]:






  
    
      
      activity_freq
      days_active
      gender_cat
      lifetime
      max_interval
      reputation
      dead
    
  
  
    
      activity_freq
       1.000000
       0.390712
      -0.042854
       0.071224
       0.098429
       0.197520
      -0.180806
    
    
      days_active
       0.390712
       1.000000
      -0.010875
       0.626381
       0.448729
       0.571510
      -0.478084
    
    
      gender_cat
      -0.042854
      -0.010875
       1.000000
       0.041786
       0.034733
       0.036558
      -0.011563
    
    
      lifetime
       0.071224
       0.626381
       0.041786
       1.000000
       0.839540
       0.409554
      -0.411050
    
    
      max_interval
       0.098429
       0.448729
       0.034733
       0.839540
       1.000000
       0.250896
      -0.329168
    
    
      reputation
       0.197520
       0.571510
       0.036558
       0.409554
       0.250896
       1.000000
      -0.183207
    
    
      dead
      -0.180806
      -0.478084
      -0.011563
      -0.411050
      -0.329168
      -0.183207
       1.000000



In [31]:

    
exog = common_users["gender_cat"]
exog = sm.add_constant(exog, prepend=True)
endog = common_users["days_active"]

mod_nbin = sm.GLM(endog, exog, family=sm.families.NegativeBinomial())

res_nbin = mod_nbin.fit()

res_nbin.summary()









    Out[31]:





Generalized Linear Model Regression Results

  Dep. Variable:      days_active      No. Observations:      110797    


  Model:                  GLM          Df Residuals:          110795    


  Model Family:    NegativeBinomial    Df Model:                   1    


  Link Function:          log          Scale:              1.20891124419


  Method:                IRLS          Log-Likelihood:      -4.2638e+05 


  Date:            Sun, 05 Oct 2014    Deviance:            1.1516e+05  


  Time:                23:28:00        Pearson chi2:         1.34e+05   


  No. Iterations:          8                                            




                coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const           2.8663      0.013    212.595   0.000      2.840     2.893


  gender_cat     -0.0504      0.014     -3.619   0.000     -0.078    -0.023

Data's shape



In [32]:

    
nobs = res_nbin.nobs

y = endog / endog.sum(1)

yhat = res_nbin.mu

pyplot.scatter(yhat, y)

res_nbin.params









    Out[32]:





const         2.866302
gender_cat   -0.050430
dtype: float64

Hypothesis 3: The frequency of participation is the same between genders.

H0: frequency(Males) = frequency(Females);

H1: frequency(Males) != frequency(Females).

Data



In [33]:

    
females_frequency = females['activity_freq']
males_frequency = males['activity_freq']

Data's summary



In [34]:

    
print "Female:"
print females_frequency.describe()
print "Median: ", females_frequency.median()

print 
print "Male:"
print males_frequency.describe()
print "Median: ", males_frequency.median()









    



Female:
count    7897.000000
mean        2.334396
std         1.508635
min         1.000000
25%         1.444444
50%         2.000000
75%         2.722222
max        18.500000
dtype: float64
Median:  2.0

Male:
count    125340.000000
mean          2.195698
std           1.501658
min           1.000000
25%           1.352941
50%           1.833333
75%           2.521739
max          45.857143
dtype: float64
Median:  1.83333333333

The data's shape



In [35]:

    
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
females_frequency.hist(ax=axes[0])
axes[0].set_title("Post Frequency by Females - Histogram")
males_frequency.hist(ax=axes[1])
axes[1].set_title("Post Frequency by Males - Histogram")









    Out[35]:





<matplotlib.text.Text at 0x12e663750>

Hypothesis test



In [36]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( females_frequency, males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(females_frequency, males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  7.28282939984e-28

Looking at the top contributors



In [37]:

    
top_females_frequency = top_females['activity_freq']
top_males_frequency = top_males['activity_freq']

Data's summary



In [38]:

    
print "Female:"
print top_females_frequency.describe()
print "Median: ", top_females_frequency.median()

print 
print "Male:"
print top_males_frequency.describe()
print "Median: ", top_males_frequency.median()









    



Female:
count    868.000000
mean       3.254567
std        2.168423
min        1.000000
25%        1.960829
50%        2.605499
75%        3.735379
max       17.769231
dtype: float64
Median:  2.60549943883

Male:
count    21572.000000
mean         3.016451
std          2.254264
min          1.000000
25%          1.811881
50%          2.379310
75%          3.385393
max         45.857143
dtype: float64
Median:  2.37931034483

The data's shape



In [39]:

    
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
top_females_frequency.hist(ax=axes[0])
axes[0].set_title("Post Frequency by Females - Histogram")
top_males_frequency.hist(ax=axes[1])
axes[1].set_title("Post Frequency by Males - Histogram")









    Out[39]:





<matplotlib.text.Text at 0x12223c690>

Hypotesis test



In [40]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( top_females_frequency, top_males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(top_females_frequency, top_males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  4.49803451219e-07

Looking at the common contributors



In [41]:

    
common_females_frequency = common_females['activity_freq']
common_males_frequency = common_males['activity_freq']

Data's summary



In [42]:

    
print "Female:"
print common_females_frequency.describe()
print "Median: ", common_females_frequency.median()

print 
print "Male:"
print common_males_frequency.describe()
print "Median: ", common_males_frequency.median()









    



Female:
count    7029.000000
mean        2.220766
std         1.363644
min         1.000000
25%         1.400000
50%         1.900000
75%         2.600000
max        18.500000
dtype: float64
Median:  1.9

Male:
count    103768.000000
mean          2.025074
std           1.224021
min           1.000000
25%           1.285714
50%           1.733333
75%           2.357143
max          38.000000
dtype: float64
Median:  1.73333333333

The data's shape



In [43]:

    
fig, axes = pyplot.subplots(nrows=1, ncols=2, figsize=(15,7))
common_females_frequency.hist(ax=axes[0])
axes[0].set_title("Post Frequency by Females - Histogram")
common_males_frequency.hist(ax=axes[1])
axes[1].set_title("Post Frequency by Males - Histogram")









    Out[43]:





<matplotlib.text.Text at 0x125ef7950>

Hypotesis test



In [44]:

    
print "Two-sample Chi-Square test: ", stats.chi2_contingency( common_females_frequency, common_males_frequency )[1]
print "Two-sample Mann Whitney U test: ",2* stats.mannwhitneyu(common_females_frequency, common_males_frequency)[1]









    



Two-sample Chi-Square test:  1.0
Two-sample Mann Whitney U test:  5.69164358077e-46

Inicialization

Here you can find the data importing and some useful functions used for analysing the data. Please, run this first, otherwise the analysis will not work.

Importing the data from the MongoDB database and inserting into a panda dataframe for easy manipulation.



In [1]:

    
from __future__ import division
import pymongo, time, pylab, numpy, pandas, math
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot
import statsmodels.api as sm

%matplotlib inline
mpl.style.use('ggplot')
# pyplot.rcdefaults()

client = pymongo.MongoClient('localhost', 27017)

community = 'stackoverflow'
stats_db = client[community].statistics

cursor = stats_db.find({'$or': [{'questions_total':{'$gt':0}}, {'answers_total':{'$gt':0}}, 
                                {'comments_total':{'$gt':0}}], 
                        'gender': {'$ne': "Unknown"} }, 
                       {u'_id': False, u'dates': True, u'reputation': True,
                        u'joined': True, u'gender':True,
                        'lifetime': True, 'max_interval': True, 'days_active': True, 
                        'gender_cat': True, 'activity_freq': True})

df =  pandas.DataFrame(list(cursor))



In [2]:

    
import datetime

def seen_death(row):
    recent = datetime.datetime(2014,1,20) - datetime.timedelta(days=int(df["max_interval"].mean()))
    return row["dates"][-1] < recent

df["dead"] = df.apply(seen_death, axis=1)



In [ ]:

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	7897.000000	7897.000000	7897	7897.000000	7897.000000	7897.000000	7897
mean	2.334396	30.105610	0	507.451066	166.153603	694.373939	0.4818285
std	1.508635	62.035094	0	466.162353	192.942626	3526.726330	0.4997013
min	1.000000	1.000000	0	0.000000	0.000000	50.000000	False
25%	1.444444	5.000000	0	90.158949	22.000000	75.000000	0
50%	2.000000	13.000000	0	400.012305	109.000000	132.000000	0
75%	2.722222	30.000000	0	804.265243	237.000000	394.000000	1
max	18.500000	1187.000000	0	1978.319806	1816.000000	141184.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	125340.000000	125340.000000	125340	125340.000000	125340.000000	125340.000000	125340
mean	2.195698	37.163443	1	640.907015	195.999737	1169.622946	0.4413276
std	1.501658	80.493505	0	532.275464	208.348216	6567.785536	0.4965476
min	1.000000	1.000000	1	0.000000	0.000000	50.000000	False
25%	1.352941	5.000000	1	163.722270	37.000000	86.000000	0
50%	1.833333	14.000000	1	552.830597	141.000000	176.000000	0
75%	2.521739	35.000000	1	1008.412889	280.000000	606.000000	1
max	45.857143	1931.000000	1	1994.007311	1892.000000	640237.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	133237.000000	133237.000000	133237.000000	133237.000000	133237.000000	133237.000000	133237
mean	2.203918	36.745123	0.940730	632.997038	194.230747	1141.454784	0.4437281
std	1.502424	79.536283	0.236131	529.524465	207.586004	6428.748907	0.4968252
min	1.000000	1.000000	0.000000	0.000000	0.000000	50.000000	False
25%	1.357143	5.000000	1.000000	158.404719	36.000000	86.000000	0
50%	1.837838	14.000000	1.000000	542.143747	139.000000	173.000000	0
75%	2.538462	35.000000	1.000000	996.311754	277.000000	591.000000	1
max	45.857143	1931.000000	1.000000	1994.007311	1892.000000	640237.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	868.000000	868.000000	868	868.000000	868.000000	868.000000	868
mean	3.254567	131.602535	0	1025.829577	187.335253	4651.165899	0.235023
std	2.168423	142.316046	0	490.574128	169.879656	9763.712921	0.4242577
min	1.000000	1.000000	0	0.000000	0.000000	1002.000000	False
25%	1.960829	51.000000	0	660.224362	72.000000	1330.750000	0
50%	2.605499	85.000000	0	1008.910835	138.000000	2027.500000	0
75%	3.735379	153.000000	0	1407.844001	241.000000	3837.250000	0
max	17.769231	1187.000000	0	1978.319806	1470.000000	141184.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	21572.000000	21572.000000	21572	21572.000000	21572.000000	21572.000000	21572
mean	3.016451	135.561700	1	1159.153001	202.813184	5655.511218	0.2146764
std	2.254264	155.665837	0	507.020628	173.629913	15036.391721	0.4106072
min	1.000000	1.000000	1	0.000000	0.000000	1001.000000	False
25%	1.811881	50.000000	1	779.427245	86.000000	1425.000000	0
50%	2.379310	89.000000	1	1186.508306	156.000000	2272.500000	0
75%	3.385393	159.000000	1	1587.056340	266.000000	4612.250000	0
max	45.857143	1931.000000	1	1994.007311	1720.000000	640237.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	22440.000000	22440.000000	22440.000000	22440.000000	22440.000000	22440.000000	22440
mean	3.025662	135.408556	0.961319	1153.995928	202.214483	5616.662210	0.2154635
std	2.251426	155.169758	0.192838	507.036016	173.508318	14868.358312	0.4111526
min	1.000000	1.000000	0.000000	0.000000	0.000000	1001.000000	False
25%	1.815947	50.000000	1.000000	775.158708	85.000000	1421.000000	0
50%	2.385542	89.000000	1.000000	1180.337493	155.000000	2263.000000	0
75%	3.400000	159.000000	1.000000	1580.400121	265.000000	4580.250000	0
max	45.857143	1931.000000	1.000000	1994.007311	1720.000000	640237.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	7029.000000	7029.000000	7029	7029.000000	7029.000000	7029.000000	7029
mean	2.220766	17.571916	0	443.437331	163.537914	205.756011	0.5123062
std	1.363644	19.890419	0	420.916398	195.453770	203.109675	0.4998841
min	1.000000	1.000000	0	0.000000	0.000000	50.000000	False
25%	1.400000	5.000000	0	68.709563	17.000000	71.000000	0
50%	1.900000	11.000000	0	340.848467	104.000000	116.000000	1
75%	2.600000	23.000000	0	709.795095	236.000000	271.000000	1
max	18.500000	218.000000	0	1948.328314	1816.000000	999.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	103768.000000	103768.000000	103768	103768.000000	103768.000000	103768.000000	103768
mean	2.025074	16.707742	1	533.170502	194.583311	237.065878	0.4884454
std	1.224021	18.910212	0	470.461833	214.836118	223.953725	0.4998689
min	1.000000	1.000000	1	0.000000	0.000000	50.000000	False
25%	1.285714	4.000000	1	106.444369	23.000000	77.000000	0
50%	1.733333	11.000000	1	441.254514	136.000000	133.000000	0
75%	2.357143	22.000000	1	845.712161	284.000000	340.000000	1
max	38.000000	340.000000	1	1988.652516	1892.000000	1000.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
count	110797.000000	110797.000000	110797.000000	110797.000000	110797.000000	110797.000000	110797
mean	2.037489	16.762566	0.936560	527.477799	192.613780	235.079569	0.4899591
std	1.234264	18.974977	0.243754	467.984372	213.791891	222.819234	0.4999014
min	1.000000	1.000000	0.000000	0.000000	0.000000	50.000000	False
25%	1.285714	4.000000	1.000000	103.061397	22.000000	76.000000	0
50%	1.750000	11.000000	1.000000	433.899242	134.000000	132.000000	0
75%	2.375000	22.000000	1.000000	836.893566	281.000000	337.000000	1
max	38.000000	340.000000	1.000000	1988.652516	1892.000000	1000.000000	True

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.439216	-0.030015	0.112770	0.062791	0.311337	-0.211586
days_active	0.439216	1.000000	0.013928	0.672904	0.355899	0.723341	-0.495259
gender_cat	-0.030015	0.013928	1.000000	0.055488	0.037444	0.052146	-0.018992
lifetime	0.112770	0.672904	0.055488	1.000000	0.755676	0.538123	-0.426949
max_interval	0.062791	0.355899	0.037444	0.755676	1.000000	0.227582	-0.290184
reputation	0.311337	0.723341	0.052146	0.538123	0.227582	1.000000	-0.262980
dead	-0.211586	-0.495259	-0.018992	-0.426949	-0.290184	-0.262980	1.000000

Dep. Variable:	days_active	No. Observations:	133237
Model:	GLM	Df Residuals:	133235
Model Family:	NegativeBinomial	Df Model:	1
Link Function:	log	Scale:	4.54114490135
Method:	IRLS	Log-Likelihood:	-6.1507e+05
Date:	Sun, 05 Oct 2014	Deviance:	2.4668e+05
Time:	23:26:57	Pearson chi2:	6.05e+05
No. Iterations:	9

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	3.4047	0.024	139.680	0.000	3.357 3.452
gender_cat	0.2106	0.025	8.382	0.000	0.161 0.260

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.236617	-0.033690	-0.326952	-0.216585	0.267009	-0.110130
days_active	0.236617	1.000000	0.003058	0.299475	-0.375538	0.653917	-0.399226
gender_cat	-0.033690	0.003058	1.000000	0.051717	0.024442	0.026455	-0.008762
lifetime	-0.326952	0.299475	0.051717	1.000000	0.415860	0.300765	-0.225730
max_interval	-0.216585	-0.375538	0.024442	0.415860	1.000000	-0.177112	0.089947
reputation	0.267009	0.653917	0.026455	0.300765	-0.177112	1.000000	-0.133198
dead	-0.110130	-0.399226	-0.008762	-0.225730	0.089947	-0.133198	1.000000

	activity_freq	days_active	gender_cat	lifetime	max_interval	reputation	dead
activity_freq	1.000000	0.390712	-0.042854	0.071224	0.098429	0.197520	-0.180806
days_active	0.390712	1.000000	-0.010875	0.626381	0.448729	0.571510	-0.478084
gender_cat	-0.042854	-0.010875	1.000000	0.041786	0.034733	0.036558	-0.011563
lifetime	0.071224	0.626381	0.041786	1.000000	0.839540	0.409554	-0.411050
max_interval	0.098429	0.448729	0.034733	0.839540	1.000000	0.250896	-0.329168
reputation	0.197520	0.571510	0.036558	0.409554	0.250896	1.000000	-0.183207
dead	-0.180806	-0.478084	-0.011563	-0.411050	-0.329168	-0.183207	1.000000