notebook.community

Edit and run



In [56]:

    
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf
%matplotlib inline



In [2]:

    
# Load the tab-separated volume metadata table and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the cell cannot run on
# another machine until it is replaced by a path relative to the project.
metadata = pd.read_csv('/Users/tunder/Dropbox/python/character/metadata/filtered_fiction_plus_18c.tsv', sep ='\t')
metadata.head()









    Out[2]:






  
    
      
      docid
      volid
      recordid
      author
      firstname
      inferreddate
      birthdate
      authgender
      enumcron
      title
    
  
  
    
      0
      14930
      uva.x004123163
      NaN
      Swift, Jonathan,
      Jonathan
      1784
      NaN
      m
      v.1
      The works of the Rev. Dr. Jonathan Swift
    
    
      1
      14931
      uva.x004123168
      NaN
      Swift, Jonathan,
      Jonathan
      1784
      NaN
      m
      v.6
      The works of the Rev. Dr. Jonathan Swift
    
    
      2
      14932
      uva.x030576706
      NaN
      Swift, Jonathan,
      Jonathan
      1784
      NaN
      m
      v.11
      The works of the Rev. Dr. Jonathan Swift
    
    
      3
      14933
      uva.x000530839
      NaN
      Swift, Jonathan,
      Jonathan
      1784
      NaN
      m
      v.12
      The works of the Rev. Dr. Jonathan Swift
    
    
      4
      14934
      nyp.33433076096019
      NaN
      Swift, Jonathan,
      Jonathan
      1784
      NaN
      m
      v. 14
      The works of the Rev. Dr. Jonathan Swift



In [3]:

    
# Load the tab-separated per-character table and preview its first rows.
# docid is read with object dtype so pandas does not infer a numeric type for
# the identifier column.
data = pd.read_csv('prestige_character_probabilities.tsv', sep = '\t', dtype = {'docid': 'object'})
data.head()









    Out[3]:






  
    
      
      docid
      charid
      gender
      pubdate
      numwords
      probability
    
  
  
    
      0
      0
      0|Betsey
      f
      1891
      334
      0.462642
    
    
      1
      0
      0|Phil
      m
      1891
      12
      0.140581
    
    
      2
      0
      0|Elizabeth
      f
      1891
      82
      0.366735
    
    
      3
      0
      0|Mr.Jones
      m
      1891
      526
      0.553426
    
    
      4
      0
      0|Mr.Mitford
      m
      1891
      14
      0.386104



In [4]:

    
# Average probability and publication date within each character gender.
grouped = data[['probability', 'gender', 'pubdate']].groupby('gender')
bygender = grouped.mean()
bygender.head()









    Out[4]:






  
    
      
      probability
      pubdate
    
    
      gender
      
      
    
  
  
    
      f
      0.547890
      1946.901165
    
    
      m
      0.438264
      1948.086980
    
    
      u
      0.473054
      1955.238318



In [50]:

    
# Read the author-level table and append a numeric author-gender indicator
# (f -> 1, m -> 0). Any authgender value without an entry in the mapping
# becomes NaN in binaryauth.
authormeta = pd.read_csv('output/authormeta.tsv', sep='\t').assign(
    binaryauth=lambda frame: frame['authgender'].map({'f': 1, 'm': 0})
)
authormeta.head()









    Out[50]:






  
    
      
      author
      num_stories
      reviewed
      authgender
      meandate
      mean_prestige
      mean_sales
      numchars
      charsize
      pct_women
      wordratio
      prob_diff
      weighted_diff
      prob_stdev
      prob_mean
      binaryauth
    
  
  
    
      0
      Beckett, Samuel
      13
      1
      m
      1966.307692
      0.820090
      0.362205
      13.846154
      79.398322
      0.368146
      0.309553
      0.052216
      0.057434
      0.045178
      0.505375
      0.0
    
    
      1
      Haggard, H. Rider
      17
      1
      m
      1898.235294
      0.534054
      0.899441
      18.117647
      343.630334
      0.281828
      0.369872
      0.044399
      0.053472
      0.055781
      0.481296
      0.0
    
    
      2
      Castlemon, Harry
      30
      0
      m
      1886.433333
      0.195307
      0.777778
      22.466667
      322.592651
      0.066558
      0.030604
      0.085897
      0.078611
      0.058242
      0.445997
      0.0
    
    
      3
      Pidgin, Charles Felton
      12
      0
      m
      1905.166667
      0.179293
      0.508571
      40.000000
      228.147965
      0.291458
      0.272297
      0.072691
      0.080473
      0.065104
      0.481867
      0.0
    
    
      4
      Lewis, Wyndham
      15
      1
      m
      1945.533333
      0.697906
      0.453704
      31.000000
      202.572051
      0.191909
      0.172247
      0.024949
      0.030662
      0.050687
      0.487141
      0.0



In [51]:

    
# Pairwise correlations between the numeric author-level columns.
# The text columns (author, authgender) are excluded explicitly: pandas >= 2.0
# no longer drops non-numeric columns silently and raises on a bare .corr().
authormeta.select_dtypes(include=['number', 'bool']).corr()









    Out[51]:






  
    
      
      num_stories
      reviewed
      meandate
      mean_prestige
      mean_sales
      numchars
      charsize
      pct_women
      wordratio
      prob_diff
      weighted_diff
      prob_stdev
      prob_mean
      binaryauth
    
  
  
    
      num_stories
      1.000000
      0.218885
      0.136339
      0.185862
      0.546059
      0.063713
      0.014521
      -0.067036
      -0.059694
      -0.032356
      -0.042344
      -0.072736
      -0.013056
      -0.148931
    
    
      reviewed
      0.218885
      1.000000
      0.136778
      0.476159
      0.288765
      0.043027
      -0.059852
      -0.021178
      0.002188
      -0.052633
      -0.054550
      -0.064606
      0.079513
      -0.053148
    
    
      meandate
      0.136339
      0.136778
      1.000000
      0.164638
      -0.029856
      0.043027
      -0.143136
      -0.144073
      -0.125291
      -0.265918
      -0.241877
      -0.284284
      0.030538
      -0.110871
    
    
      mean_prestige
      0.185862
      0.476159
      0.164638
      1.000000
      0.187538
      0.031609
      -0.118692
      0.099911
      0.105872
      -0.134599
      -0.150892
      -0.122698
      0.237385
      0.033229
    
    
      mean_sales
      0.546059
      0.288765
      -0.029856
      0.187538
      1.000000
      0.087558
      0.066422
      -0.049083
      -0.033431
      0.045883
      -0.010033
      -0.043694
      -0.031284
      -0.134423
    
    
      numchars
      0.063713
      0.043027
      0.043027
      0.031609
      0.087558
      1.000000
      -0.029548
      0.047587
      0.053322
      0.020753
      0.005668
      0.191749
      -0.001760
      0.021899
    
    
      charsize
      0.014521
      -0.059852
      -0.143136
      -0.118692
      0.066422
      -0.029548
      1.000000
      0.165401
      0.167887
      -0.011163
      -0.013940
      -0.173970
      0.116147
      0.142267
    
    
      pct_women
      -0.067036
      -0.021178
      -0.144073
      0.099911
      -0.049083
      0.047587
      0.165401
      1.000000
      0.851266
      -0.207798
      -0.257510
      -0.130680
      0.685301
      0.581080
    
    
      wordratio
      -0.059694
      0.002188
      -0.125291
      0.105872
      -0.033431
      0.053322
      0.167887
      0.851266
      1.000000
      -0.191358
      -0.216788
      -0.080728
      0.608965
      0.570961
    
    
      prob_diff
      -0.032356
      -0.052633
      -0.265918
      -0.134599
      0.045883
      0.020753
      -0.011163
      -0.207798
      -0.191358
      1.000000
      0.815491
      0.441574
      -0.245904
      -0.204912
    
    
      weighted_diff
      -0.042344
      -0.054550
      -0.241877
      -0.150892
      -0.010033
      0.005668
      -0.013940
      -0.257510
      -0.216788
      0.815491
      1.000000
      0.383396
      -0.248943
      -0.228787
    
    
      prob_stdev
      -0.072736
      -0.064606
      -0.284284
      -0.122698
      -0.043694
      0.191749
      -0.173970
      -0.130680
      -0.080728
      0.441574
      0.383396
      1.000000
      -0.213965
      -0.120883
    
    
      prob_mean
      -0.013056
      0.079513
      0.030538
      0.237385
      -0.031284
      -0.001760
      0.116147
      0.685301
      0.608965
      -0.245904
      -0.248943
      -0.213965
      1.000000
      0.507696
    
    
      binaryauth
      -0.148931
      -0.053148
      -0.110871
      0.033229
      -0.134423
      0.021899
      0.142267
      0.581080
      0.570961
      -0.204912
      -0.228787
      -0.120883
      0.507696
      1.000000



In [74]:

    
# Ordinary least squares fit of weighted_diff on the share of women characters,
# the author's mean date, and the binary author-gender indicator.
# NOTE(review): binaryauth is NaN for authors whose authgender is neither 'f'
# nor 'm'; confirm how the formula interface treats those rows.
author_formula = 'weighted_diff ~ pct_women + meandate + binaryauth'
authormodel = smf.ols(formula=author_formula, data=authormeta).fit()
authormodel.summary()









    Out[74]:





OLS Regression Results

  Dep. Variable:       weighted_diff     R-squared:             0.164


  Model:                    OLS          Adj. R-squared:        0.161


  Method:              Least Squares     F-statistic:           52.30


  Date:              Thu, 20 Jul 2017    Prob (F-statistic):  7.11e-31


  Time:                  09:46:10        Log-Likelihood:       1927.6


  No. Observations:          804         AIC:                  -3847.


  Df Residuals:              800         BIC:                  -3828.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                coef      std err       t       P>|t|   [0.025     0.975]  


  Intercept       0.5127      0.050     10.273   0.000      0.415      0.611


  pct_women      -0.0479      0.008     -6.153   0.000     -0.063     -0.033


  meandate       -0.0002    2.6e-05     -8.817   0.000     -0.000     -0.000


  binaryauth     -0.0058      0.002     -2.973   0.003     -0.010     -0.002




  Omnibus:        258.429    Durbin-Watson:         2.013


  Prob(Omnibus):   0.000     Jarque-Bera (JB):   9749.763


  Skew:           -0.733     Prob(JB):               0.00


  Kurtosis:       19.997     Cond. No.           1.22e+05



In [66]:

    
# Correlations among numeric columns for male authors only.
# Text columns are excluded explicitly because pandas >= 2.0 raises on a bare
# .corr() when non-numeric columns are present. binaryauth is constant within
# this subset, so its row and column come out as NaN.
authormeta[authormeta.authgender == 'm'].select_dtypes(include=['number', 'bool']).corr()









    Out[66]:






  
    
      
      num_stories
      reviewed
      meandate
      mean_prestige
      mean_sales
      numchars
      charsize
      pct_women
      wordratio
      prob_diff
      weighted_diff
      prob_stdev
      prob_mean
      binaryauth
    
  
  
    
      num_stories
      1.000000
      0.233313
      0.141723
      0.203661
      0.499386
      0.072355
      0.042709
      0.024341
      0.035975
      -0.086567
      -0.098265
      -0.137105
      0.092943
      NaN
    
    
      reviewed
      0.233313
      1.000000
      0.082260
      0.467277
      0.290199
      0.094755
      -0.055158
      0.103213
      0.110484
      -0.122033
      -0.118770
      -0.083917
      0.209005
      NaN
    
    
      meandate
      0.141723
      0.082260
      1.000000
      0.097814
      -0.041160
      0.112701
      -0.161474
      -0.077726
      -0.098248
      -0.304985
      -0.260813
      -0.320776
      0.147506
      NaN
    
    
      mean_prestige
      0.203661
      0.467277
      0.097814
      1.000000
      0.208443
      0.017251
      -0.097868
      0.210429
      0.186846
      -0.151419
      -0.184130
      -0.146116
      0.364310
      NaN
    
    
      mean_sales
      0.499386
      0.290199
      -0.041160
      0.208443
      1.000000
      0.070393
      0.061285
      0.044880
      0.085258
      -0.015551
      -0.080063
      -0.153395
      0.061477
      NaN
    
    
      numchars
      0.072355
      0.094755
      0.112701
      0.017251
      0.070393
      1.000000
      0.003731
      0.016576
      0.020073
      0.011869
      0.010405
      0.121162
      0.028648
      NaN
    
    
      charsize
      0.042709
      -0.055158
      -0.161474
      -0.097868
      0.061285
      0.003731
      1.000000
      0.168346
      0.153375
      0.073256
      0.054532
      -0.170386
      0.050075
      NaN
    
    
      pct_women
      0.024341
      0.103213
      -0.077726
      0.210429
      0.044880
      0.016576
      0.168346
      1.000000
      0.817833
      -0.086775
      -0.162461
      -0.140035
      0.612956
      NaN
    
    
      wordratio
      0.035975
      0.110484
      -0.098248
      0.186846
      0.085258
      0.020073
      0.153375
      0.817833
      1.000000
      -0.063072
      -0.114356
      -0.056690
      0.508932
      NaN
    
    
      prob_diff
      -0.086567
      -0.122033
      -0.304985
      -0.151419
      -0.015551
      0.011869
      0.073256
      -0.086775
      -0.063072
      1.000000
      0.831499
      0.411620
      -0.197177
      NaN
    
    
      weighted_diff
      -0.098265
      -0.118770
      -0.260813
      -0.184130
      -0.080063
      0.010405
      0.054532
      -0.162461
      -0.114356
      0.831499
      1.000000
      0.350210
      -0.189095
      NaN
    
    
      prob_stdev
      -0.137105
      -0.083917
      -0.320776
      -0.146116
      -0.153395
      0.121162
      -0.170386
      -0.140035
      -0.056690
      0.411620
      0.350210
      1.000000
      -0.232417
      NaN
    
    
      prob_mean
      0.092943
      0.209005
      0.147506
      0.364310
      0.061477
      0.028648
      0.050075
      0.612956
      0.508932
      -0.197177
      -0.189095
      -0.232417
      1.000000
      NaN
    
    
      binaryauth
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN



In [15]:

    
# Correlations among numeric columns for female authors only.
# Text columns are excluded explicitly because pandas >= 2.0 raises on a bare
# .corr() when non-numeric columns are present.
authormeta[authormeta.authgender == 'f'].select_dtypes(include=['number', 'bool']).corr()









    Out[15]:






  
    
      
      num_stories
      meandate
      mean_prestige
      mean_sales
      numchars
      charsize
      pct_women
      wordratio
      prob_diff
      weighted_diff
      prob_stdev
      prob_mean
    
  
  
    
      num_stories
      1.000000
      0.070293
      0.151998
      0.667579
      0.024708
      0.028403
      0.015314
      0.016825
      0.059267
      0.052446
      0.005806
      0.029140
    
    
      meandate
      0.070293
      1.000000
      0.237417
      -0.077555
      -0.069427
      -0.096531
      -0.097399
      -0.021226
      -0.148353
      -0.191635
      -0.187684
      -0.145022
    
    
      mean_prestige
      0.151998
      0.237417
      1.000000
      0.150619
      0.059172
      -0.153872
      -0.025213
      0.008218
      -0.003268
      -0.042805
      -0.054551
      -0.029945
    
    
      mean_sales
      0.667579
      -0.077555
      0.150619
      1.000000
      0.121972
      0.131655
      0.021562
      0.017285
      0.062698
      0.061543
      0.015039
      0.031240
    
    
      numchars
      0.024708
      -0.069427
      0.059172
      0.121972
      1.000000
      -0.117479
      0.107331
      0.127662
      0.065604
      -0.005340
      0.340781
      0.005935
    
    
      charsize
      0.028403
      -0.096531
      -0.153872
      0.131655
      -0.117479
      1.000000
      -0.000329
      0.063396
      0.011968
      -0.023379
      -0.152545
      0.086531
    
    
      pct_women
      0.015314
      -0.097399
      -0.025213
      0.021562
      0.107331
      -0.000329
      1.000000
      0.716555
      -0.176143
      -0.153280
      -0.033422
      0.497102
    
    
      wordratio
      0.016825
      -0.021226
      0.008218
      0.017285
      0.127662
      0.063396
      0.716555
      1.000000
      -0.112095
      -0.079195
      0.034334
      0.433225
    
    
      prob_diff
      0.059267
      -0.148353
      -0.003268
      0.062698
      0.065604
      0.011968
      -0.176143
      -0.112095
      1.000000
      0.762709
      0.493984
      -0.017732
    
    
      weighted_diff
      0.052446
      -0.191635
      -0.042805
      0.061543
      -0.005340
      -0.023379
      -0.153280
      -0.079195
      0.762709
      1.000000
      0.389891
      -0.024579
    
    
      prob_stdev
      0.005806
      -0.187684
      -0.054551
      0.015039
      0.340781
      -0.152545
      -0.033422
      0.034334
      0.493984
      0.389891
      1.000000
      -0.086264
    
    
      prob_mean
      0.029140
      -0.145022
      -0.029945
      0.031240
      0.005935
      0.086531
      0.497102
      0.433225
      -0.017732
      -0.024579
      -0.086264
      1.000000



In [67]:

    
# Correlations among numeric columns for authors whose mean date is later than 1899.
# Text columns are excluded explicitly because pandas >= 2.0 raises on a bare
# .corr() when non-numeric columns are present.
authormeta[authormeta.meandate > 1899].select_dtypes(include=['number', 'bool']).corr()









    Out[67]:






  
    
      
      num_stories
      reviewed
      meandate
      mean_prestige
      mean_sales
      numchars
      charsize
      pct_women
      wordratio
      prob_diff
      weighted_diff
      prob_stdev
      prob_mean
      binaryauth
    
  
  
    
      num_stories
      1.000000
      0.216207
      0.028832
      0.192364
      0.552781
      0.080999
      -0.008270
      -0.020080
      -0.026239
      0.017046
      -0.017367
      -0.051526
      0.034022
      -0.154354
    
    
      reviewed
      0.216207
      1.000000
      0.118624
      0.493815
      0.253594
      0.115210
      -0.104733
      0.085850
      0.101939
      -0.043750
      -0.115606
      -0.019682
      0.201517
      -0.008467
    
    
      meandate
      0.028832
      0.118624
      1.000000
      0.144864
      0.020717
      0.285391
      -0.090324
      0.039179
      0.045959
      -0.108303
      -0.142281
      -0.097999
      0.121877
      0.032213
    
    
      mean_prestige
      0.192364
      0.493815
      0.144864
      1.000000
      0.173665
      0.077410
      -0.155905
      0.231569
      0.235436
      -0.159304
      -0.234630
      -0.102519
      0.398523
      0.089821
    
    
      mean_sales
      0.552781
      0.253594
      0.020717
      0.173665
      1.000000
      0.180814
      -0.002249
      -0.039367
      -0.009546
      0.078740
      0.000745
      -0.008500
      0.019888
      -0.124467
    
    
      numchars
      0.080999
      0.115210
      0.285391
      0.077410
      0.180814
      1.000000
      -0.157878
      0.044582
      0.041489
      0.046356
      -0.019275
      0.173617
      0.026330
      -0.053770
    
    
      charsize
      -0.008270
      -0.104733
      -0.090324
      -0.155905
      -0.002249
      -0.157878
      1.000000
      0.104278
      0.155188
      -0.017621
      -0.032118
      -0.227254
      0.167334
      0.165798
    
    
      pct_women
      -0.020080
      0.085850
      0.039179
      0.231569
      -0.039367
      0.044582
      0.104278
      1.000000
      0.874111
      -0.231020
      -0.333915
      -0.161123
      0.717269
      0.575442
    
    
      wordratio
      -0.026239
      0.101939
      0.045959
      0.235436
      -0.009546
      0.041489
      0.155188
      0.874111
      1.000000
      -0.242582
      -0.357869
      -0.159709
      0.667903
      0.590426
    
    
      prob_diff
      0.017046
      -0.043750
      -0.108303
      -0.159304
      0.078740
      0.046356
      -0.017621
      -0.231020
      -0.242582
      1.000000
      0.824731
      0.421907
      -0.249271
      -0.214917
    
    
      weighted_diff
      -0.017367
      -0.115606
      -0.142281
      -0.234630
      0.000745
      -0.019275
      -0.032118
      -0.333915
      -0.357869
      0.824731
      1.000000
      0.353381
      -0.318739
      -0.275348
    
    
      prob_stdev
      -0.051526
      -0.019682
      -0.097999
      -0.102519
      -0.008500
      0.173617
      -0.227254
      -0.161123
      -0.159709
      0.421907
      0.353381
      1.000000
      -0.229075
      -0.167212
    
    
      prob_mean
      0.034022
      0.201517
      0.121877
      0.398523
      0.019888
      0.026330
      0.167334
      0.717269
      0.667903
      -0.249271
      -0.318739
      -0.229075
      1.000000
      0.493687
    
    
      binaryauth
      -0.154354
      -0.008467
      0.032213
      0.089821
      -0.124467
      -0.053770
      0.165798
      0.575442
      0.590426
      -0.214917
      -0.275348
      -0.167212
      0.493687
      1.000000



In [31]:

    
# Correlations among numeric columns for male authors whose mean date is later
# than 1920. Text columns are excluded explicitly because pandas >= 2.0 raises
# on a bare .corr() when non-numeric columns are present.
authormeta[(authormeta.meandate > 1920) & (authormeta.authgender == 'm')].select_dtypes(include=['number', 'bool']).corr()









    Out[31]:






  
    
      
      num_stories
      meandate
      mean_prestige
      mean_sales
      numchars
      charsize
      pct_women
      wordratio
      prob_diff
      prob_stdev
      prob_mean
    
  
  
    
      num_stories
      1.000000
      -0.065257
      0.285475
      0.508761
      0.040698
      0.056041
      0.067147
      0.109698
      -0.024528
      -0.027788
      0.131550
    
    
      meandate
      -0.065257
      1.000000
      0.134988
      -0.046628
      0.194593
      -0.133596
      0.003914
      0.032082
      -0.207532
      -0.019468
      0.019617
    
    
      mean_prestige
      0.285475
      0.134988
      1.000000
      0.257232
      0.022541
      -0.085217
      0.396409
      0.379550
      -0.156451
      -0.044407
      0.511218
    
    
      mean_sales
      0.508761
      -0.046628
      0.257232
      1.000000
      0.194626
      0.013702
      0.062069
      0.175694
      -0.071750
      0.089569
      0.169740
    
    
      numchars
      0.040698
      0.194593
      0.022541
      0.194626
      1.000000
      -0.125138
      0.012126
      0.053806
      0.037956
      0.340389
      -0.028815
    
    
      charsize
      0.056041
      -0.133596
      -0.085217
      0.013702
      -0.125138
      1.000000
      0.106259
      0.095825
      0.038795
      -0.217598
      0.103200
    
    
      pct_women
      0.067147
      0.003914
      0.396409
      0.062069
      0.012126
      0.106259
      1.000000
      0.866698
      -0.146658
      -0.082226
      0.659543
    
    
      wordratio
      0.109698
      0.032082
      0.379550
      0.175694
      0.053806
      0.095825
      0.866698
      1.000000
      -0.182480
      -0.094531
      0.604170
    
    
      prob_diff
      -0.024528
      -0.207532
      -0.156451
      -0.071750
      0.037956
      0.038795
      -0.146658
      -0.182480
      1.000000
      0.382050
      -0.108680
    
    
      prob_stdev
      -0.027788
      -0.019468
      -0.044407
      0.089569
      0.340389
      -0.217598
      -0.082226
      -0.094531
      0.382050
      1.000000
      -0.126740
    
    
      prob_mean
      0.131550
      0.019617
      0.511218
      0.169740
      -0.028815
      0.103200
      0.659543
      0.604170
      -0.108680
      -0.126740
      1.000000



In [17]:

    
# Load a second author table (comma-separated); the following cells compare its
# author names against authormeta.author.
otherauthor = pd.read_csv('pairedwithprestige.csv')



In [18]:

    
def trim_to_24(aname):
    """Normalise an author name to at most 24 characters.

    Parameters
    ----------
    aname : object
        Raw author value. Anything that is not a string (for example None or
        a float NaN) is treated as an anonymous author.

    Returns
    -------
    str
        'Anonymous' for non-string input; otherwise the name with leading and
        trailing parentheses, commas, periods, spaces, square brackets and
        digits removed, cut to its first 24 characters.
    """
    # isinstance, rather than an exact type comparison, also accepts str
    # subclasses (e.g. numpy.str_), which are real names and not missing values.
    if not isinstance(aname, str):
        return 'Anonymous'

    # Truncation happens after stripping, so the result can still end in a
    # space when the cut lands immediately after one.
    return aname.strip('(),. .[0123456789]')[:24]

# Distinct normalised author names found in otherauthor.
other_author = {trim_to_24(name) for name in otherauthor['author']}



In [19]:

    
# Normalised names present in otherauthor but absent from authormeta.
print(other_author - set(authormeta['author']))









    



{'Overstolz, Marie Emelie ', 'Leigh, Alfred', 'Montagu, Lily H', 'Grey', 'Elton, Arthur Hallam', 'Andrews, Anabel (Follanb', 'Holyoke, Hetty', 'Pardoe', 'Fogerty, J', 'O. Douglas', 'Chatterji, Bankim Chandr', 'Post, Helen (Wilmans', 'Maria', 'Engles, William M', 'Johnston, Sir Harry', 'Aytoun, William Edmondst', 'Ingram, J. Forsyth', 'Lean, Florence', 'Newall, John', 'Vereker, Charles Smyth', 'Yale, Catharine Brooks', 'Harbert, Lizzie Boynton', 'Goff, H. N. K', 'McLain, Mary Webster', 'Hoffman, Mary J', 'Christie-Murray, David', 'Newell, Charles Martin', 'Rex, Beach', 'Leonowens, Anna Harriett', 'Hannay, James', 'Chittenden, L. E', 'Radecliffe, Noell', 'Glenn, Isa', 'Perelaer, Michael Theoph', 'Veitch, Sophie F. F', 'Reddin, Kenneth', 'Buckley, William', 'Smith, Francis Hopkinson', 'Watson, William', 'Spencer, Lillian', 'Volckhausen, Adeline', 'Colvill, Helen Hester', 'Swift, John Franklin', 'Perry, Alice', 'Bradford, O. K', 'Aïdé, Hamilton', 'Smythies, Harriet Maria ', 'Châteauclair, Wilfrid', 'Edwards, Matilda Betham', 'Scott, Geo. G', 'Zack', 'Valentine, L', 'Estvan, Mathilde', 'Adderley, James Granvill', 'Fox, Richard A', 'Smith, William', 'Goulding, F. R', 'Pomeroy, John', 'Rives, Hallie', 'Conybeare, William John', 'anonymous', 'Houstoun'}



In [35]:

    
# Names present in authormeta but absent from the normalised otherauthor set.
print(set(authormeta['author']) - other_author)









    



set()



In [68]:

    
# Load the tab-separated story-level genre table and preview its first rows.
genremeta = pd.read_csv('output/genre_storymeta.tsv', sep = '\t')
genremeta.head()









    Out[68]:






  
    
      
      docid
      author
      title
      authgender
      pubdate
      genre
      numchars
      charsize
      pct_women
      wordratio
      prob_diff
      weighted_diff
      prob_stdev
      prob_mean
    
  
  
    
      0
      uc1.32106011196133
      Heinlein, Robert A.
      Starship troopers
      m
      1959
      scifi
      29
      58.862069
      0.153846
      0.048659
      0.002086
      0.045627
      0.049584
      0.478987
    
    
      1
      8469
      Brontë, Emily
      Wuthering Heights
      f
      1847
      historical
      32
      421.593750
      0.433333
      0.443459
      0.013174
      0.036966
      0.063450
      0.491630
    
    
      2
      10651
      Austen, Jane
      Pride and Prejudice
      f
      1813
      romance
      42
      346.928571
      0.714286
      0.640862
      0.068306
      0.058855
      0.063728
      0.509821
    
    
      3
      mdp.39015034269400
      Leonard, Elmore,
      Riding the rap
      m
      1995
      detective
      25
      360.520000
      0.250000
      0.153210
      0.027186
      0.070882
      0.058211
      0.496419
    
    
      4
      mdp.39015063511748
      Berkeley, Anthony,
      The poisoned chocolates c
      m
      1929
      detective
      19
      314.842105
      0.500000
      0.243402
      0.037797
      0.043456
      0.042703
      0.486050



In [69]:

    
# Averages per (genre, author gender) for stories with pubdate > 1900
# (strictly greater, so 1900 itself is excluded).
grouped = genremeta[genremeta.pubdate > 1900].groupby(['genre', 'authgender'])
# numeric_only=True leaves out the text columns (docid, author, title); recent
# pandas raises a TypeError when asked to average them instead of dropping
# them silently, and passing np.mean to aggregate() is deprecated.
genreavg = grouped.mean(numeric_only=True)
genreavg









    Out[69]:






  
    
      
      
      pubdate
      numchars
      charsize
      pct_women
      wordratio
      prob_diff
      weighted_diff
      prob_stdev
      prob_mean
    
    
      genre
      authgender
      
      
      
      
      
      
      
      
      
    
  
  
    
      detective
      f
      1940.500000
      36.166667
      204.851237
      0.358923
      0.385034
      0.043938
      0.046218
      0.063361
      0.481287
    
    
      m
      1942.319149
      31.680851
      194.327318
      0.253157
      0.200722
      0.048396
      0.056077
      0.057398
      0.483436
    
    
      u
      1994.000000
      73.000000
      330.356164
      0.308824
      0.484858
      0.034783
      0.011604
      0.055324
      0.479031
    
    
      romance
      f
      1947.375000
      42.875000
      214.062008
      0.428041
      0.518899
      0.036215
      0.033877
      0.057505
      0.512110
    
    
      scifi
      f
      1981.333333
      43.888889
      189.844360
      0.347950
      0.329206
      0.018747
      0.029946
      0.053799
      0.488649
    
    
      m
      1953.883333
      24.866667
      243.646166
      0.220020
      0.214166
      0.034972
      0.035996
      0.053741
      0.480353
    
    
      u
      1990.000000
      41.000000
      214.048780
      0.206897
      0.136118
      -0.032973
      -0.018758
      0.056144
      0.475397
    
    
      western
      m
      1935.454545
      23.454545
      251.631583
      0.185665
      0.202370
      0.061239
      0.057680
      0.052471
      0.452596



In [7]:

    
# Pairwise correlations between the numeric story-level columns.
# Text columns (docid, author, title, authgender, genre) are excluded
# explicitly: pandas >= 2.0 raises on a bare .corr() when they are present.
genremeta.select_dtypes(include=['number', 'bool']).corr()









    Out[7]:






  
    
      
      pubdate
      numchars
      charsize
      pct_women
      wordratio
      prob_diff
      weighted_diff
      prob_stdev
      prob_mean
    
  
  
    
      pubdate
      1.000000
      0.087487
      -0.047805
      -0.203835
      -0.179622
      -0.290928
      -0.315559
      -0.284285
      -0.037632
    
    
      numchars
      0.087487
      1.000000
      -0.116675
      0.191617
      0.213407
      -0.070770
      -0.087493
      0.118414
      0.095820
    
    
      charsize
      -0.047805
      -0.116675
      1.000000
      0.083578
      0.045047
      -0.029180
      -0.024740
      -0.246300
      0.179546
    
    
      pct_women
      -0.203835
      0.191617
      0.083578
      1.000000
      0.786075
      -0.104107
      -0.100844
      0.065427
      0.561660
    
    
      wordratio
      -0.179622
      0.213407
      0.045047
      0.786075
      1.000000
      -0.037690
      -0.101150
      0.058492
      0.463602
    
    
      prob_diff
      -0.290928
      -0.070770
      -0.029180
      -0.104107
      -0.037690
      1.000000
      0.714091
      0.483968
      -0.107519
    
    
      weighted_diff
      -0.315559
      -0.087493
      -0.024740
      -0.100844
      -0.101150
      0.714091
      1.000000
      0.383959
      -0.084137
    
    
      prob_stdev
      -0.284285
      0.118414
      -0.246300
      0.065427
      0.058492
      0.483968
      0.383959
      1.000000
      -0.068062
    
    
      prob_mean
      -0.037632
      0.095820
      0.179546
      0.561660
      0.463602
      -0.107519
      -0.084137
      -0.068062
      1.000000



In [35]:

    
def after1900(date):
    """Return 0 when ``date`` is strictly less than 1900, otherwise 1.

    The result is 1 for every value where ``date < 1900`` is false, which
    includes 1900 itself and also NaN (comparisons with NaN are false).
    """
    return 0 if date < 1900 else 1

# Tag every author row with a 0/1 flag: 0 if meandate < 1900, else 1.
authormeta['century'] = authormeta.meandate.apply(after1900)

# Average the numeric author-level columns within each (century, authgender) group.
# numeric_only=True keeps string columns such as `author` out of the mean;
# newer pandas raises on them instead of silently dropping them, and passing
# np.mean as a callable to aggregate() is deprecated.
grouped = authormeta.groupby(['century', 'authgender'])
authoravg = grouped.mean(numeric_only=True)
authoravg









    Out[35]:






  
    
      
      
      num_stories
      reviewed
      meandate
      mean_prestige
      mean_sales
      numchars
      charsize
      pct_women
      wordratio
      prob_diff
      weighted_diff
      prob_stdev
      prob_mean
    
    
      century
      authgender
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      f
      8.447514
      0.397790
      1877.738949
      0.478870
      0.495331
      33.654888
      232.038433
      0.437561
      0.485522
      0.055409
      0.056436
      0.066894
      0.500467
    
    
      m
      11.641860
      0.483721
      1878.095249
      0.481033
      0.581735
      30.629130
      214.476230
      0.292293
      0.296706
      0.068334
      0.068545
      0.069302
      0.480436
    
    
      u
      1.909091
      0.227273
      1871.607792
      0.363867
      0.248613
      32.154329
      204.644244
      0.395810
      0.413640
      0.061165
      0.068886
      0.064771
      0.487076
    
    
      1
      f
      9.563910
      0.496241
      1930.825268
      0.534538
      0.441817
      31.713569
      224.399359
      0.423866
      0.483194
      0.047738
      0.047177
      0.059574
      0.501259
    
    
      m
      15.491039
      0.501792
      1929.428001
      0.496075
      0.520256
      33.337437
      190.748340
      0.275554
      0.270708
      0.057418
      0.059390
      0.063218
      0.484201
    
    
      u
      5.000000
      0.392857
      1927.096812
      0.520311
      0.229838
      30.158466
      194.269540
      0.315528
      0.343114
      0.046244
      0.051748
      0.058423
      0.489751



In [29]:

    
# Scatter plot with one point per row of authormeta: meandate on x, weighted_diff on y.
authormeta.plot(kind='scatter', x='meandate', y='weighted_diff')









    Out[29]:





<matplotlib.axes._subplots.AxesSubplot at 0x11003a898>



In [ ]:

	docid	volid	recordid	author	firstname	inferreddate	birthdate	authgender	enumcron	title
0	14930	uva.x004123163	NaN	Swift, Jonathan,	Jonathan	1784	NaN	m	v.1	The works of the Rev. Dr. Jonathan Swift
1	14931	uva.x004123168	NaN	Swift, Jonathan,	Jonathan	1784	NaN	m	v.6	The works of the Rev. Dr. Jonathan Swift
2	14932	uva.x030576706	NaN	Swift, Jonathan,	Jonathan	1784	NaN	m	v.11	The works of the Rev. Dr. Jonathan Swift
3	14933	uva.x000530839	NaN	Swift, Jonathan,	Jonathan	1784	NaN	m	v.12	The works of the Rev. Dr. Jonathan Swift
4	14934	nyp.33433076096019	NaN	Swift, Jonathan,	Jonathan	1784	NaN	m	v. 14	The works of the Rev. Dr. Jonathan Swift

	charid	gender	pubdate	numwords	probability
0	0\|Betsey	f	1891	334	0.462642
1	0\|Phil	m	1891	12	0.140581
2	0\|Elizabeth	f	1891	82	0.366735
3	0\|Mr.Jones	m	1891	526	0.553426
4	0\|Mr.Mitford	m	1891	14	0.386104

	probability	pubdate
gender
f	0.547890	1946.901165
m	0.438264	1948.086980
u	0.473054	1955.238318

	author	num_stories	reviewed	authgender	meandate	mean_prestige	mean_sales	numchars	charsize	pct_women	wordratio	prob_diff	weighted_diff	prob_stdev	prob_mean
0	Beckett, Samuel	13	1	m	1966.307692	0.820090	0.362205	13.846154	79.398322	0.368146	0.309553	0.052216	0.057434	0.045178	0.505375
1	Haggard, H. Rider	17	1	m	1898.235294	0.534054	0.899441	18.117647	343.630334	0.281828	0.369872	0.044399	0.053472	0.055781	0.481296
2	Castlemon, Harry	30	0	m	1886.433333	0.195307	0.777778	22.466667	322.592651	0.066558	0.030604	0.085897	0.078611	0.058242	0.445997
3	Pidgin, Charles Felton	12	0	m	1905.166667	0.179293	0.508571	40.000000	228.147965	0.291458	0.272297	0.072691	0.080473	0.065104	0.481867
4	Lewis, Wyndham	15	1	m	1945.533333	0.697906	0.453704	31.000000	202.572051	0.191909	0.172247	0.024949	0.030662	0.050687	0.487141

	num_stories	reviewed	meandate	mean_prestige	mean_sales	numchars	charsize	pct_women	wordratio	prob_diff	weighted_diff	prob_stdev	prob_mean	binaryauth
num_stories	1.000000	0.218885	0.136339	0.185862	0.546059	0.063713	0.014521	-0.067036	-0.059694	-0.032356	-0.042344	-0.072736	-0.013056	-0.148931
reviewed	0.218885	1.000000	0.136778	0.476159	0.288765	0.043027	-0.059852	-0.021178	0.002188	-0.052633	-0.054550	-0.064606	0.079513	-0.053148
meandate	0.136339	0.136778	1.000000	0.164638	-0.029856	0.043027	-0.143136	-0.144073	-0.125291	-0.265918	-0.241877	-0.284284	0.030538	-0.110871
mean_prestige	0.185862	0.476159	0.164638	1.000000	0.187538	0.031609	-0.118692	0.099911	0.105872	-0.134599	-0.150892	-0.122698	0.237385	0.033229
mean_sales	0.546059	0.288765	-0.029856	0.187538	1.000000	0.087558	0.066422	-0.049083	-0.033431	0.045883	-0.010033	-0.043694	-0.031284	-0.134423
numchars	0.063713	0.043027	0.043027	0.031609	0.087558	1.000000	-0.029548	0.047587	0.053322	0.020753	0.005668	0.191749	-0.001760	0.021899
charsize	0.014521	-0.059852	-0.143136	-0.118692	0.066422	-0.029548	1.000000	0.165401	0.167887	-0.011163	-0.013940	-0.173970	0.116147	0.142267
pct_women	-0.067036	-0.021178	-0.144073	0.099911	-0.049083	0.047587	0.165401	1.000000	0.851266	-0.207798	-0.257510	-0.130680	0.685301	0.581080
wordratio	-0.059694	0.002188	-0.125291	0.105872	-0.033431	0.053322	0.167887	0.851266	1.000000	-0.191358	-0.216788	-0.080728	0.608965	0.570961
prob_diff	-0.032356	-0.052633	-0.265918	-0.134599	0.045883	0.020753	-0.011163	-0.207798	-0.191358	1.000000	0.815491	0.441574	-0.245904	-0.204912
weighted_diff	-0.042344	-0.054550	-0.241877	-0.150892	-0.010033	0.005668	-0.013940	-0.257510	-0.216788	0.815491	1.000000	0.383396	-0.248943	-0.228787
prob_stdev	-0.072736	-0.064606	-0.284284	-0.122698	-0.043694	0.191749	-0.173970	-0.130680	-0.080728	0.441574	0.383396	1.000000	-0.213965	-0.120883
prob_mean	-0.013056	0.079513	0.030538	0.237385	-0.031284	-0.001760	0.116147	0.685301	0.608965	-0.245904	-0.248943	-0.213965	1.000000	0.507696
binaryauth	-0.148931	-0.053148	-0.110871	0.033229	-0.134423	0.021899	0.142267	0.581080	0.570961	-0.204912	-0.228787	-0.120883	0.507696	1.000000

Dep. Variable:	weighted_diff	R-squared:	0.164
Model:	OLS	Adj. R-squared:	0.161
Method:	Least Squares	F-statistic:	52.30
Date:	Thu, 20 Jul 2017	Prob (F-statistic):	7.11e-31
Time:	09:46:10	Log-Likelihood:	1927.6
No. Observations:	804	AIC:	-3847.
Df Residuals:	800	BIC:	-3828.
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	0.5127	0.050	10.273	0.000	0.415	0.611
pct_women	-0.0479	0.008	-6.153	0.000	-0.063	-0.033
meandate	-0.0002	2.6e-05	-8.817	0.000	-0.000	-0.000
binaryauth	-0.0058	0.002	-2.973	0.003	-0.010	-0.002

Omnibus:	258.429	Durbin-Watson:	2.013
Prob(Omnibus):	0.000	Jarque-Bera (JB):	9749.763
Skew:	-0.733	Prob(JB):	0.00
Kurtosis:	19.997	Cond. No.	1.22e+05

	num_stories	reviewed	meandate	mean_prestige	mean_sales	numchars	charsize	pct_women	wordratio	prob_diff	weighted_diff	prob_stdev	prob_mean	binaryauth
num_stories	1.000000	0.233313	0.141723	0.203661	0.499386	0.072355	0.042709	0.024341	0.035975	-0.086567	-0.098265	-0.137105	0.092943	NaN
reviewed	0.233313	1.000000	0.082260	0.467277	0.290199	0.094755	-0.055158	0.103213	0.110484	-0.122033	-0.118770	-0.083917	0.209005	NaN
meandate	0.141723	0.082260	1.000000	0.097814	-0.041160	0.112701	-0.161474	-0.077726	-0.098248	-0.304985	-0.260813	-0.320776	0.147506	NaN
mean_prestige	0.203661	0.467277	0.097814	1.000000	0.208443	0.017251	-0.097868	0.210429	0.186846	-0.151419	-0.184130	-0.146116	0.364310	NaN
mean_sales	0.499386	0.290199	-0.041160	0.208443	1.000000	0.070393	0.061285	0.044880	0.085258	-0.015551	-0.080063	-0.153395	0.061477	NaN
numchars	0.072355	0.094755	0.112701	0.017251	0.070393	1.000000	0.003731	0.016576	0.020073	0.011869	0.010405	0.121162	0.028648	NaN
charsize	0.042709	-0.055158	-0.161474	-0.097868	0.061285	0.003731	1.000000	0.168346	0.153375	0.073256	0.054532	-0.170386	0.050075	NaN
pct_women	0.024341	0.103213	-0.077726	0.210429	0.044880	0.016576	0.168346	1.000000	0.817833	-0.086775	-0.162461	-0.140035	0.612956	NaN
wordratio	0.035975	0.110484	-0.098248	0.186846	0.085258	0.020073	0.153375	0.817833	1.000000	-0.063072	-0.114356	-0.056690	0.508932	NaN
prob_diff	-0.086567	-0.122033	-0.304985	-0.151419	-0.015551	0.011869	0.073256	-0.086775	-0.063072	1.000000	0.831499	0.411620	-0.197177	NaN
weighted_diff	-0.098265	-0.118770	-0.260813	-0.184130	-0.080063	0.010405	0.054532	-0.162461	-0.114356	0.831499	1.000000	0.350210	-0.189095	NaN
prob_stdev	-0.137105	-0.083917	-0.320776	-0.146116	-0.153395	0.121162	-0.170386	-0.140035	-0.056690	0.411620	0.350210	1.000000	-0.232417	NaN
prob_mean	0.092943	0.209005	0.147506	0.364310	0.061477	0.028648	0.050075	0.612956	0.508932	-0.197177	-0.189095	-0.232417	1.000000	NaN
binaryauth	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	num_stories	meandate	mean_prestige	mean_sales	numchars	charsize	pct_women	wordratio	prob_diff	weighted_diff	prob_stdev	prob_mean
num_stories	1.000000	0.070293	0.151998	0.667579	0.024708	0.028403	0.015314	0.016825	0.059267	0.052446	0.005806	0.029140
meandate	0.070293	1.000000	0.237417	-0.077555	-0.069427	-0.096531	-0.097399	-0.021226	-0.148353	-0.191635	-0.187684	-0.145022
mean_prestige	0.151998	0.237417	1.000000	0.150619	0.059172	-0.153872	-0.025213	0.008218	-0.003268	-0.042805	-0.054551	-0.029945
mean_sales	0.667579	-0.077555	0.150619	1.000000	0.121972	0.131655	0.021562	0.017285	0.062698	0.061543	0.015039	0.031240
numchars	0.024708	-0.069427	0.059172	0.121972	1.000000	-0.117479	0.107331	0.127662	0.065604	-0.005340	0.340781	0.005935
charsize	0.028403	-0.096531	-0.153872	0.131655	-0.117479	1.000000	-0.000329	0.063396	0.011968	-0.023379	-0.152545	0.086531
pct_women	0.015314	-0.097399	-0.025213	0.021562	0.107331	-0.000329	1.000000	0.716555	-0.176143	-0.153280	-0.033422	0.497102
wordratio	0.016825	-0.021226	0.008218	0.017285	0.127662	0.063396	0.716555	1.000000	-0.112095	-0.079195	0.034334	0.433225
prob_diff	0.059267	-0.148353	-0.003268	0.062698	0.065604	0.011968	-0.176143	-0.112095	1.000000	0.762709	0.493984	-0.017732
weighted_diff	0.052446	-0.191635	-0.042805	0.061543	-0.005340	-0.023379	-0.153280	-0.079195	0.762709	1.000000	0.389891	-0.024579
prob_stdev	0.005806	-0.187684	-0.054551	0.015039	0.340781	-0.152545	-0.033422	0.034334	0.493984	0.389891	1.000000	-0.086264
prob_mean	0.029140	-0.145022	-0.029945	0.031240	0.005935	0.086531	0.497102	0.433225	-0.017732	-0.024579	-0.086264	1.000000

	num_stories	reviewed	meandate	mean_prestige	mean_sales	numchars	charsize	pct_women	wordratio	prob_diff	weighted_diff	prob_stdev	prob_mean	binaryauth
num_stories	1.000000	0.216207	0.028832	0.192364	0.552781	0.080999	-0.008270	-0.020080	-0.026239	0.017046	-0.017367	-0.051526	0.034022	-0.154354
reviewed	0.216207	1.000000	0.118624	0.493815	0.253594	0.115210	-0.104733	0.085850	0.101939	-0.043750	-0.115606	-0.019682	0.201517	-0.008467
meandate	0.028832	0.118624	1.000000	0.144864	0.020717	0.285391	-0.090324	0.039179	0.045959	-0.108303	-0.142281	-0.097999	0.121877	0.032213
mean_prestige	0.192364	0.493815	0.144864	1.000000	0.173665	0.077410	-0.155905	0.231569	0.235436	-0.159304	-0.234630	-0.102519	0.398523	0.089821
mean_sales	0.552781	0.253594	0.020717	0.173665	1.000000	0.180814	-0.002249	-0.039367	-0.009546	0.078740	0.000745	-0.008500	0.019888	-0.124467
numchars	0.080999	0.115210	0.285391	0.077410	0.180814	1.000000	-0.157878	0.044582	0.041489	0.046356	-0.019275	0.173617	0.026330	-0.053770
charsize	-0.008270	-0.104733	-0.090324	-0.155905	-0.002249	-0.157878	1.000000	0.104278	0.155188	-0.017621	-0.032118	-0.227254	0.167334	0.165798
pct_women	-0.020080	0.085850	0.039179	0.231569	-0.039367	0.044582	0.104278	1.000000	0.874111	-0.231020	-0.333915	-0.161123	0.717269	0.575442
wordratio	-0.026239	0.101939	0.045959	0.235436	-0.009546	0.041489	0.155188	0.874111	1.000000	-0.242582	-0.357869	-0.159709	0.667903	0.590426
prob_diff	0.017046	-0.043750	-0.108303	-0.159304	0.078740	0.046356	-0.017621	-0.231020	-0.242582	1.000000	0.824731	0.421907	-0.249271	-0.214917
weighted_diff	-0.017367	-0.115606	-0.142281	-0.234630	0.000745	-0.019275	-0.032118	-0.333915	-0.357869	0.824731	1.000000	0.353381	-0.318739	-0.275348
prob_stdev	-0.051526	-0.019682	-0.097999	-0.102519	-0.008500	0.173617	-0.227254	-0.161123	-0.159709	0.421907	0.353381	1.000000	-0.229075	-0.167212
prob_mean	0.034022	0.201517	0.121877	0.398523	0.019888	0.026330	0.167334	0.717269	0.667903	-0.249271	-0.318739	-0.229075	1.000000	0.493687
binaryauth	-0.154354	-0.008467	0.032213	0.089821	-0.124467	-0.053770	0.165798	0.575442	0.590426	-0.214917	-0.275348	-0.167212	0.493687	1.000000

	num_stories	meandate	mean_prestige	mean_sales	numchars	charsize	pct_women	wordratio	prob_diff	prob_stdev	prob_mean
num_stories	1.000000	-0.065257	0.285475	0.508761	0.040698	0.056041	0.067147	0.109698	-0.024528	-0.027788	0.131550
meandate	-0.065257	1.000000	0.134988	-0.046628	0.194593	-0.133596	0.003914	0.032082	-0.207532	-0.019468	0.019617
mean_prestige	0.285475	0.134988	1.000000	0.257232	0.022541	-0.085217	0.396409	0.379550	-0.156451	-0.044407	0.511218
mean_sales	0.508761	-0.046628	0.257232	1.000000	0.194626	0.013702	0.062069	0.175694	-0.071750	0.089569	0.169740
numchars	0.040698	0.194593	0.022541	0.194626	1.000000	-0.125138	0.012126	0.053806	0.037956	0.340389	-0.028815
charsize	0.056041	-0.133596	-0.085217	0.013702	-0.125138	1.000000	0.106259	0.095825	0.038795	-0.217598	0.103200
pct_women	0.067147	0.003914	0.396409	0.062069	0.012126	0.106259	1.000000	0.866698	-0.146658	-0.082226	0.659543
wordratio	0.109698	0.032082	0.379550	0.175694	0.053806	0.095825	0.866698	1.000000	-0.182480	-0.094531	0.604170
prob_diff	-0.024528	-0.207532	-0.156451	-0.071750	0.037956	0.038795	-0.146658	-0.182480	1.000000	0.382050	-0.108680
prob_stdev	-0.027788	-0.019468	-0.044407	0.089569	0.340389	-0.217598	-0.082226	-0.094531	0.382050	1.000000	-0.126740
prob_mean	0.131550	0.019617	0.511218	0.169740	-0.028815	0.103200	0.659543	0.604170	-0.108680	-0.126740	1.000000

	docid	author	title	authgender	pubdate	genre	numchars	charsize	pct_women	wordratio	prob_diff	weighted_diff	prob_stdev	prob_mean
0	uc1.32106011196133	Heinlein, Robert A.	Starship troopers	m	1959	scifi	29	58.862069	0.153846	0.048659	0.002086	0.045627	0.049584	0.478987
1	8469	Brontë, Emily	Wuthering Heights	f	1847	historical	32	421.593750	0.433333	0.443459	0.013174	0.036966	0.063450	0.491630
2	10651	Austen, Jane	Pride and Prejudice	f	1813	romance	42	346.928571	0.714286	0.640862	0.068306	0.058855	0.063728	0.509821
3	mdp.39015034269400	Leonard, Elmore,	Riding the rap	m	1995	detective	25	360.520000	0.250000	0.153210	0.027186	0.070882	0.058211	0.496419
4	mdp.39015063511748	Berkeley, Anthony,	The poisoned chocolates c	m	1929	detective	19	314.842105	0.500000	0.243402	0.037797	0.043456	0.042703	0.486050

		pubdate	numchars	charsize	pct_women	wordratio	prob_diff	weighted_diff	prob_stdev	prob_mean
genre	authgender
detective	f	1940.500000	36.166667	204.851237	0.358923	0.385034	0.043938	0.046218	0.063361	0.481287
	m	1942.319149	31.680851	194.327318	0.253157	0.200722	0.048396	0.056077	0.057398	0.483436
	u	1994.000000	73.000000	330.356164	0.308824	0.484858	0.034783	0.011604	0.055324	0.479031
romance	f	1947.375000	42.875000	214.062008	0.428041	0.518899	0.036215	0.033877	0.057505	0.512110
scifi	f	1981.333333	43.888889	189.844360	0.347950	0.329206	0.018747	0.029946	0.053799	0.488649
	m	1953.883333	24.866667	243.646166	0.220020	0.214166	0.034972	0.035996	0.053741	0.480353
	u	1990.000000	41.000000	214.048780	0.206897	0.136118	-0.032973	-0.018758	0.056144	0.475397
western	m	1935.454545	23.454545	251.631583	0.185665	0.202370	0.061239	0.057680	0.052471	0.452596

	pubdate	numchars	charsize	pct_women	wordratio	prob_diff	weighted_diff	prob_stdev	prob_mean
pubdate	1.000000	0.087487	-0.047805	-0.203835	-0.179622	-0.290928	-0.315559	-0.284285	-0.037632
numchars	0.087487	1.000000	-0.116675	0.191617	0.213407	-0.070770	-0.087493	0.118414	0.095820
charsize	-0.047805	-0.116675	1.000000	0.083578	0.045047	-0.029180	-0.024740	-0.246300	0.179546
pct_women	-0.203835	0.191617	0.083578	1.000000	0.786075	-0.104107	-0.100844	0.065427	0.561660
wordratio	-0.179622	0.213407	0.045047	0.786075	1.000000	-0.037690	-0.101150	0.058492	0.463602
prob_diff	-0.290928	-0.070770	-0.029180	-0.104107	-0.037690	1.000000	0.714091	0.483968	-0.107519
weighted_diff	-0.315559	-0.087493	-0.024740	-0.100844	-0.101150	0.714091	1.000000	0.383959	-0.084137
prob_stdev	-0.284285	0.118414	-0.246300	0.065427	0.058492	0.483968	0.383959	1.000000	-0.068062
prob_mean	-0.037632	0.095820	0.179546	0.561660	0.463602	-0.107519	-0.084137	-0.068062	1.000000

		num_stories	reviewed	meandate	mean_prestige	mean_sales	numchars	charsize	pct_women	wordratio	prob_diff	weighted_diff	prob_stdev	prob_mean
century	authgender
0	f	8.447514	0.397790	1877.738949	0.478870	0.495331	33.654888	232.038433	0.437561	0.485522	0.055409	0.056436	0.066894	0.500467
	m	11.641860	0.483721	1878.095249	0.481033	0.581735	30.629130	214.476230	0.292293	0.296706	0.068334	0.068545	0.069302	0.480436
	u	1.909091	0.227273	1871.607792	0.363867	0.248613	32.154329	204.644244	0.395810	0.413640	0.061165	0.068886	0.064771	0.487076
1	f	9.563910	0.496241	1930.825268	0.534538	0.441817	31.713569	224.399359	0.423866	0.483194	0.047738	0.047177	0.059574	0.501259
	m	15.491039	0.501792	1929.428001	0.496075	0.520256	33.337437	190.748340	0.275554	0.270708	0.057418	0.059390	0.063218	0.484201
	u	5.000000	0.392857	1927.096812	0.520311	0.229838	30.158466	194.269540	0.315528	0.343114	0.046244	0.051748	0.058423	0.489751