notebook.community

Edit and run



In [1]:

    
##############################
#Author Skye Ouyang
#Date: 24th Apr
##############################
import pandas as pd
import numpy as np
from pandas import DataFrame
import sklearn
import requests
import StringIO
import json
import re



In [2]:

    
base_path='./'
whole_data=DataFrame.from_csv(base_path+'Text Analytics.csv',index_col='BOOK ID')
whole_data









    Out[2]:






  
    
      
      Label
      Avg_sen_len
      Blank
      Avg_num_word
      Avg_word_len
      num_of_weapon
      num_of_bloody
      num_of_mysterious
      num_of_PopWord
      Exclamation_Ratio
      Ques_Mark_Ratio
      Period_Ratio
      Stopwords_Ratio
      Avg_sentiment
    
    
      BOOK ID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      1
      66.21
      13.50
      15.13
      4.38
      34
      61
      35
      30747
      0.001428
      0.070812
      0.927761
      0.431888
      -0.004217
    
    
      2
      1
      53.26
      11.76
      13.30
      4.00
      21
      41
      18
      14681
      0.004876
      0.082004
      0.913121
      0.409934
      -0.005893
    
    
      3
      1
      60.19
      12.39
      15.62
      3.85
      17
      34
      15
      14669
      0.021185
      0.127748
      0.851067
      0.376801
      -0.005749
    
    
      4
      1
      52.72
      10.56
      12.32
      4.28
      18
      31
      19
      10815
      0.037219
      0.131997
      0.830785
      0.403794
      -0.003993
    
    
      5
      1
      82.84
      19.73
      21.67
      3.82
      18
      16
      11
      13146
      0.001936
      0.077184
      0.920881
      0.445443
      -0.003967
    
    
      6
      1
      78.68
      17.08
      19.38
      4.06
      37
      74
      46
      62775
      0.039811
      0.094483
      0.865706
      0.449838
      -0.002384
    
    
      7
      1
      100.66
      21.16
      24.28
      4.15
      26
      69
      37
      28809
      0.045664
      0.085841
      0.868496
      0.434448
      -0.003804
    
    
      8
      1
      50.32
      10.91
      13.31
      3.78
      33
      53
      22
      26800
      0.002528
      0.086703
      0.910769
      0.402746
      -0.004073
    
    
      9
      1
      83.64
      18.06
      20.42
      4.10
      19
      36
      21
      7448
      0.025133
      0.061502
      0.913365
      0.444903
      -0.007596
    
    
      11
      1
      108.08
      21.29
      25.04
      4.32
      28
      64
      41
      15372
      0.005491
      0.021223
      0.973286
      0.363366
      -0.005909
    
    
      12
      1
      55.07
      11.42
      14.28
      3.86
      31
      45
      36
      37852
      0.006527
      0.101500
      0.891973
      0.378242
      -0.002677
    
    
      13
      1
      55.43
      11.27
      13.04
      4.25
      23
      51
      32
      19667
      0.002848
      0.119022
      0.878130
      0.389047
      -0.005003
    
    
      14
      1
      47.81
      10.12
      11.63
      4.11
      24
      44
      22
      13350
      0.008105
      0.108889
      0.883007
      0.395945
      -0.007379
    
    
      15
      1
      54.25
      11.20
      13.03
      4.16
      19
      35
      23
      13986
      0.027626
      0.132134
      0.840240
      0.402049
      -0.004922
    
    
      16
      1
      49.55
      10.02
      11.62
      4.27
      31
      54
      30
      20512
      0.024403
      0.071559
      0.904038
      0.380537
      -0.004479
    
    
      17
      1
      49.90
      10.30
      12.96
      3.85
      19
      38
      23
      17698
      0.002038
      0.131960
      0.866002
      0.390458
      -0.005908
    
    
      19
      1
      48.47
      10.24
      11.75
      4.12
      19
      34
      13
      14747
      0.001867
      0.138056
      0.860077
      0.409527
      -0.006761
    
    
      21
      1
      54.84
      11.39
      13.03
      4.21
      21
      59
      31
      18827
      0.008961
      0.132456
      0.858583
      0.418912
      -0.006739
    
    
      22
      1
      47.45
      9.88
      12.47
      3.80
      21
      29
      19
      14796
      0.002936
      0.139375
      0.857690
      0.381034
      -0.005744
    
    
      23
      1
      83.58
      18.14
      20.68
      4.04
      36
      68
      39
      37032
      0.042359
      0.096669
      0.860972
      0.445314
      -0.003014
    
    
      24
      1
      56.23
      11.70
      14.50
      3.88
      47
      68
      45
      25212
      0.009069
      0.088477
      0.902454
      0.378405
      -0.004760
    
    
      25
      1
      62.53
      12.76
      14.25
      4.39
      34
      47
      32
      32662
      0.000765
      0.083484
      0.915752
      0.435393
      -0.003651
    
    
      26
      1
      56.91
      11.51
      13.38
      4.25
      17
      33
      21
      13110
      0.030593
      0.130057
      0.839350
      0.390497
      -0.004099
    
    
      27
      1
      80.83
      17.08
      20.34
      3.97
      24
      46
      30
      35965
      0.071162
      0.087031
      0.841807
      0.424724
      -0.002683
    
    
      28
      1
      56.31
      11.66
      14.59
      3.86
      33
      63
      36
      23725
      0.017976
      0.075566
      0.906458
      0.371894
      -0.005111
    
    
      30
      1
      63.32
      13.42
      16.34
      3.88
      33
      57
      37
      20774
      0.005794
      0.082254
      0.911952
      0.393344
      -0.005641
    
    
      31
      1
      59.90
      12.70
      15.41
      3.89
      22
      33
      18
      19612
      0.012434
      0.100391
      0.887175
      0.382525
      -0.003405
    
    
      32
      1
      88.37
      18.47
      20.95
      4.22
      26
      51
      32
      19529
      0.008701
      0.140993
      0.850306
      0.421528
      -0.006533
    
    
      33
      1
      84.31
      17.21
      20.60
      4.09
      45
      75
      40
      88656
      0.093831
      0.083238
      0.822931
      0.413722
      -0.001785
    
    
      34
      1
      66.21
      12.53
      15.12
      4.38
      29
      49
      47
      22661
      0.029045
      0.088927
      0.882028
      0.358528
      -0.004122
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      816
      0
      105.78
      20.92
      24.09
      4.39
      20
      45
      32
      21616
      0.011321
      0.057714
      0.930966
      0.405790
      -0.005171
    
    
      818
      0
      72.88
      15.26
      17.51
      4.16
      21
      44
      33
      16644
      0.003178
      0.121469
      0.875353
      0.407512
      -0.003924
    
    
      819
      0
      51.25
      11.19
      12.77
      4.01
      27
      29
      16
      15698
      0.000698
      0.079407
      0.919895
      0.401947
      -0.006857
    
    
      822
      0
      50.31
      10.84
      12.52
      4.02
      23
      32
      11
      20876
      0.001383
      0.109286
      0.889331
      0.395386
      -0.005711
    
    
      826
      0
      66.02
      13.64
      15.36
      4.30
      34
      41
      24
      13239
      0.006555
      0.116445
      0.877000
      0.414128
      -0.004404
    
    
      828
      0
      67.67
      14.96
      17.01
      3.98
      17
      31
      20
      12801
      0.032057
      0.145448
      0.822495
      0.405323
      -0.005043
    
    
      832
      0
      62.73
      13.17
      17.02
      3.69
      23
      40
      23
      17942
      0.004077
      0.112054
      0.883870
      0.363192
      -0.004725
    
    
      834
      0
      67.65
      14.30
      16.10
      4.20
      27
      39
      24
      13029
      0.001299
      0.097178
      0.901523
      0.417177
      -0.005183
    
    
      836
      0
      63.27
      13.23
      15.10
      4.19
      23
      47
      25
      15349
      0.000767
      0.110376
      0.888857
      0.407069
      -0.004114
    
    
      839
      0
      59.67
      12.45
      14.24
      4.19
      19
      47
      28
      13891
      0.003133
      0.134724
      0.862143
      0.404630
      -0.004949
    
    
      844
      0
      55.33
      11.45
      13.35
      4.14
      28
      35
      15
      13309
      0.003425
      0.126202
      0.870373
      0.397424
      -0.006587
    
    
      850
      0
      66.13
      13.41
      15.74
      4.20
      23
      46
      36
      11411
      0.005469
      0.101492
      0.893039
      0.411550
      -0.007386
    
    
      854
      0
      101.99
      21.96
      24.94
      4.09
      26
      40
      29
      15100
      0.000913
      0.112962
      0.886125
      0.419969
      -0.006389
    
    
      855
      0
      73.59
      15.47
      17.77
      4.14
      32
      58
      35
      15917
      0.007117
      0.105749
      0.887134
      0.411086
      -0.006587
    
    
      860
      0
      82.29
      16.58
      18.98
      4.33
      23
      65
      43
      18326
      0.034692
      0.056580
      0.908728
      0.411588
      -0.004099
    
    
      864
      0
      46.52
      9.51
      11.18
      4.16
      21
      39
      34
      13860
      0.001450
      0.099573
      0.898978
      0.385226
      -0.005997
    
    
      866
      0
      65.69
      13.94
      16.02
      4.10
      14
      19
      21
      31326
      0.013528
      0.101190
      0.885281
      0.433487
      -0.000601
    
    
      868
      0
      68.44
      14.50
      16.66
      4.11
      12
      18
      21
      18371
      0.010949
      0.079423
      0.909628
      0.438282
      -0.005570
    
    
      870
      0
      70.24
      14.84
      17.03
      4.13
      13
      20
      19
      15384
      0.008680
      0.073779
      0.917541
      0.443674
      -0.004892
    
    
      872
      0
      69.87
      15.12
      17.13
      4.08
      14
      19
      22
      23359
      0.011550
      0.077670
      0.910780
      0.462138
      -0.004629
    
    
      874
      0
      69.98
      15.07
      17.05
      4.10
      12
      26
      24
      5759
      0.009580
      0.075818
      0.914602
      0.460665
      0.000059
    
    
      878
      0
      63.52
      12.84
      15.76
      4.03
      27
      41
      18
      17251
      0.019949
      0.079616
      0.900435
      0.361699
      -0.004511
    
    
      879
      0
      68.38
      14.30
      16.28
      4.20
      25
      53
      34
      8067
      0.015775
      0.081354
      0.902871
      0.427792
      -0.003822
    
    
      881
      0
      80.02
      17.41
      19.77
      4.05
      28
      45
      30
      15554
      0.000974
      0.103408
      0.895618
      0.426759
      -0.005783
    
    
      894
      0
      64.10
      14.09
      16.24
      3.95
      23
      32
      17
      12164
      0.008230
      0.115984
      0.875785
      0.408528
      -0.005740
    
    
      897
      0
      56.35
      11.20
      12.88
      4.38
      15
      10
      4
      12478
      0.015719
      0.093549
      0.890732
      0.049587
      -0.004804
    
    
      902
      0
      57.97
      11.90
      13.85
      4.19
      31
      38
      32
      11962
      0.007213
      0.080013
      0.912773
      0.399001
      -0.006421
    
    
      903
      0
      84.74
      17.61
      21.04
      4.03
      28
      63
      32
      3833
      0.008932
      0.176357
      0.814711
      0.357600
      -0.002973
    
    
      906
      0
      99.52
      20.60
      24.56
      4.05
      14
      9
      9
      45029
      0.070019
      0.012519
      0.917462
      0.248194
      -0.003369
    
    
      908
      0
      65.26
      13.98
      16.27
      4.01
      26
      31
      18
      33519
      0.002062
      0.084654
      0.913284
      0.379555
      -0.003820
    
  

200 rows × 14 columns



In [3]:

    
whole_data.head()









    Out[3]:






  
    
      
      Label
      Avg_sen_len
      Blank
      Avg_num_word
      Avg_word_len
      num_of_weapon
      num_of_bloody
      num_of_mysterious
      num_of_PopWord
      Exclamation_Ratio
      Ques_Mark_Ratio
      Period_Ratio
      Stopwords_Ratio
      Avg_sentiment
    
    
      BOOK ID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      1
      66.21
      13.50
      15.13
      4.38
      34
      61
      35
      30747
      0.001428
      0.070812
      0.927761
      0.431888
      -0.004217
    
    
      2
      1
      53.26
      11.76
      13.30
      4.00
      21
      41
      18
      14681
      0.004876
      0.082004
      0.913121
      0.409934
      -0.005893
    
    
      3
      1
      60.19
      12.39
      15.62
      3.85
      17
      34
      15
      14669
      0.021185
      0.127748
      0.851067
      0.376801
      -0.005749
    
    
      4
      1
      52.72
      10.56
      12.32
      4.28
      18
      31
      19
      10815
      0.037219
      0.131997
      0.830785
      0.403794
      -0.003993
    
    
      5
      1
      82.84
      19.73
      21.67
      3.82
      18
      16
      11
      13146
      0.001936
      0.077184
      0.920881
      0.445443
      -0.003967



In [4]:

    
whole_data.describe()









    Out[4]:






  
    
      
      Label
      Avg_sen_len
      Blank
      Avg_num_word
      Avg_word_len
      num_of_weapon
      num_of_bloody
      num_of_mysterious
      num_of_PopWord
      Exclamation_Ratio
      Ques_Mark_Ratio
      Period_Ratio
      Stopwords_Ratio
      Avg_sentiment
    
  
  
    
      count
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
    
    
      mean
      0.500000
      64.510850
      13.480000
      15.734350
      4.098200
      26.610000
      43.670000
      26.240000
      21469.220000
      0.016519
      0.105939
      0.877542
      0.394061
      -0.004905
    
    
      std
      0.501255
      13.181462
      2.745602
      3.065132
      0.165033
      7.931104
      14.264941
      9.525021
      13997.464919
      0.020156
      0.030764
      0.036451
      0.054025
      0.001592
    
    
      min
      0.000000
      43.770000
      8.670000
      10.810000
      3.660000
      6.000000
      5.000000
      1.000000
      3240.000000
      0.000000
      0.002588
      0.753951
      0.049587
      -0.013573
    
    
      25%
      0.000000
      55.367500
      11.562500
      13.535000
      4.010000
      21.750000
      34.000000
      20.750000
      14303.500000
      0.003989
      0.083422
      0.854945
      0.383138
      -0.005741
    
    
      50%
      0.500000
      61.850000
      12.855000
      15.110000
      4.100000
      26.000000
      44.000000
      25.000000
      18220.500000
      0.008691
      0.106156
      0.881681
      0.398482
      -0.004863
    
    
      75%
      1.000000
      69.995000
      14.817500
      17.130000
      4.190000
      32.000000
      52.000000
      32.000000
      24242.250000
      0.021320
      0.125323
      0.902412
      0.416372
      -0.004017
    
    
      max
      1.000000
      128.800000
      25.760000
      27.960000
      4.610000
      52.000000
      92.000000
      55.000000
      107384.000000
      0.127455
      0.194009
      0.994177
      0.462138
      0.000059



In [6]:

    
Whole_data=(whole_data-whole_data.min())/(whole_data.max()-whole_data.min())
whole_data









    Out[6]:






  
    
      
      Label
      Avg_sen_len
      Blank
      Avg_num_word
      Avg_word_len
      num_of_weapon
      num_of_bloody
      num_of_mysterious
      num_of_PopWord
      Exclamation_Ratio
      Ques_Mark_Ratio
      Period_Ratio
      Stopwords_Ratio
      Avg_sentiment
    
    
      BOOK ID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      1
      66.21
      13.50
      15.13
      4.38
      34
      61
      35
      30747
      0.001428
      0.070812
      0.927761
      0.431888
      -0.004217
    
    
      2
      1
      53.26
      11.76
      13.30
      4.00
      21
      41
      18
      14681
      0.004876
      0.082004
      0.913121
      0.409934
      -0.005893
    
    
      3
      1
      60.19
      12.39
      15.62
      3.85
      17
      34
      15
      14669
      0.021185
      0.127748
      0.851067
      0.376801
      -0.005749
    
    
      4
      1
      52.72
      10.56
      12.32
      4.28
      18
      31
      19
      10815
      0.037219
      0.131997
      0.830785
      0.403794
      -0.003993
    
    
      5
      1
      82.84
      19.73
      21.67
      3.82
      18
      16
      11
      13146
      0.001936
      0.077184
      0.920881
      0.445443
      -0.003967
    
    
      6
      1
      78.68
      17.08
      19.38
      4.06
      37
      74
      46
      62775
      0.039811
      0.094483
      0.865706
      0.449838
      -0.002384
    
    
      7
      1
      100.66
      21.16
      24.28
      4.15
      26
      69
      37
      28809
      0.045664
      0.085841
      0.868496
      0.434448
      -0.003804
    
    
      8
      1
      50.32
      10.91
      13.31
      3.78
      33
      53
      22
      26800
      0.002528
      0.086703
      0.910769
      0.402746
      -0.004073
    
    
      9
      1
      83.64
      18.06
      20.42
      4.10
      19
      36
      21
      7448
      0.025133
      0.061502
      0.913365
      0.444903
      -0.007596
    
    
      11
      1
      108.08
      21.29
      25.04
      4.32
      28
      64
      41
      15372
      0.005491
      0.021223
      0.973286
      0.363366
      -0.005909
    
    
      12
      1
      55.07
      11.42
      14.28
      3.86
      31
      45
      36
      37852
      0.006527
      0.101500
      0.891973
      0.378242
      -0.002677
    
    
      13
      1
      55.43
      11.27
      13.04
      4.25
      23
      51
      32
      19667
      0.002848
      0.119022
      0.878130
      0.389047
      -0.005003
    
    
      14
      1
      47.81
      10.12
      11.63
      4.11
      24
      44
      22
      13350
      0.008105
      0.108889
      0.883007
      0.395945
      -0.007379
    
    
      15
      1
      54.25
      11.20
      13.03
      4.16
      19
      35
      23
      13986
      0.027626
      0.132134
      0.840240
      0.402049
      -0.004922
    
    
      16
      1
      49.55
      10.02
      11.62
      4.27
      31
      54
      30
      20512
      0.024403
      0.071559
      0.904038
      0.380537
      -0.004479
    
    
      17
      1
      49.90
      10.30
      12.96
      3.85
      19
      38
      23
      17698
      0.002038
      0.131960
      0.866002
      0.390458
      -0.005908
    
    
      19
      1
      48.47
      10.24
      11.75
      4.12
      19
      34
      13
      14747
      0.001867
      0.138056
      0.860077
      0.409527
      -0.006761
    
    
      21
      1
      54.84
      11.39
      13.03
      4.21
      21
      59
      31
      18827
      0.008961
      0.132456
      0.858583
      0.418912
      -0.006739
    
    
      22
      1
      47.45
      9.88
      12.47
      3.80
      21
      29
      19
      14796
      0.002936
      0.139375
      0.857690
      0.381034
      -0.005744
    
    
      23
      1
      83.58
      18.14
      20.68
      4.04
      36
      68
      39
      37032
      0.042359
      0.096669
      0.860972
      0.445314
      -0.003014
    
    
      24
      1
      56.23
      11.70
      14.50
      3.88
      47
      68
      45
      25212
      0.009069
      0.088477
      0.902454
      0.378405
      -0.004760
    
    
      25
      1
      62.53
      12.76
      14.25
      4.39
      34
      47
      32
      32662
      0.000765
      0.083484
      0.915752
      0.435393
      -0.003651
    
    
      26
      1
      56.91
      11.51
      13.38
      4.25
      17
      33
      21
      13110
      0.030593
      0.130057
      0.839350
      0.390497
      -0.004099
    
    
      27
      1
      80.83
      17.08
      20.34
      3.97
      24
      46
      30
      35965
      0.071162
      0.087031
      0.841807
      0.424724
      -0.002683
    
    
      28
      1
      56.31
      11.66
      14.59
      3.86
      33
      63
      36
      23725
      0.017976
      0.075566
      0.906458
      0.371894
      -0.005111
    
    
      30
      1
      63.32
      13.42
      16.34
      3.88
      33
      57
      37
      20774
      0.005794
      0.082254
      0.911952
      0.393344
      -0.005641
    
    
      31
      1
      59.90
      12.70
      15.41
      3.89
      22
      33
      18
      19612
      0.012434
      0.100391
      0.887175
      0.382525
      -0.003405
    
    
      32
      1
      88.37
      18.47
      20.95
      4.22
      26
      51
      32
      19529
      0.008701
      0.140993
      0.850306
      0.421528
      -0.006533
    
    
      33
      1
      84.31
      17.21
      20.60
      4.09
      45
      75
      40
      88656
      0.093831
      0.083238
      0.822931
      0.413722
      -0.001785
    
    
      34
      1
      66.21
      12.53
      15.12
      4.38
      29
      49
      47
      22661
      0.029045
      0.088927
      0.882028
      0.358528
      -0.004122
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      816
      0
      105.78
      20.92
      24.09
      4.39
      20
      45
      32
      21616
      0.011321
      0.057714
      0.930966
      0.405790
      -0.005171
    
    
      818
      0
      72.88
      15.26
      17.51
      4.16
      21
      44
      33
      16644
      0.003178
      0.121469
      0.875353
      0.407512
      -0.003924
    
    
      819
      0
      51.25
      11.19
      12.77
      4.01
      27
      29
      16
      15698
      0.000698
      0.079407
      0.919895
      0.401947
      -0.006857
    
    
      822
      0
      50.31
      10.84
      12.52
      4.02
      23
      32
      11
      20876
      0.001383
      0.109286
      0.889331
      0.395386
      -0.005711
    
    
      826
      0
      66.02
      13.64
      15.36
      4.30
      34
      41
      24
      13239
      0.006555
      0.116445
      0.877000
      0.414128
      -0.004404
    
    
      828
      0
      67.67
      14.96
      17.01
      3.98
      17
      31
      20
      12801
      0.032057
      0.145448
      0.822495
      0.405323
      -0.005043
    
    
      832
      0
      62.73
      13.17
      17.02
      3.69
      23
      40
      23
      17942
      0.004077
      0.112054
      0.883870
      0.363192
      -0.004725
    
    
      834
      0
      67.65
      14.30
      16.10
      4.20
      27
      39
      24
      13029
      0.001299
      0.097178
      0.901523
      0.417177
      -0.005183
    
    
      836
      0
      63.27
      13.23
      15.10
      4.19
      23
      47
      25
      15349
      0.000767
      0.110376
      0.888857
      0.407069
      -0.004114
    
    
      839
      0
      59.67
      12.45
      14.24
      4.19
      19
      47
      28
      13891
      0.003133
      0.134724
      0.862143
      0.404630
      -0.004949
    
    
      844
      0
      55.33
      11.45
      13.35
      4.14
      28
      35
      15
      13309
      0.003425
      0.126202
      0.870373
      0.397424
      -0.006587
    
    
      850
      0
      66.13
      13.41
      15.74
      4.20
      23
      46
      36
      11411
      0.005469
      0.101492
      0.893039
      0.411550
      -0.007386
    
    
      854
      0
      101.99
      21.96
      24.94
      4.09
      26
      40
      29
      15100
      0.000913
      0.112962
      0.886125
      0.419969
      -0.006389
    
    
      855
      0
      73.59
      15.47
      17.77
      4.14
      32
      58
      35
      15917
      0.007117
      0.105749
      0.887134
      0.411086
      -0.006587
    
    
      860
      0
      82.29
      16.58
      18.98
      4.33
      23
      65
      43
      18326
      0.034692
      0.056580
      0.908728
      0.411588
      -0.004099
    
    
      864
      0
      46.52
      9.51
      11.18
      4.16
      21
      39
      34
      13860
      0.001450
      0.099573
      0.898978
      0.385226
      -0.005997
    
    
      866
      0
      65.69
      13.94
      16.02
      4.10
      14
      19
      21
      31326
      0.013528
      0.101190
      0.885281
      0.433487
      -0.000601
    
    
      868
      0
      68.44
      14.50
      16.66
      4.11
      12
      18
      21
      18371
      0.010949
      0.079423
      0.909628
      0.438282
      -0.005570
    
    
      870
      0
      70.24
      14.84
      17.03
      4.13
      13
      20
      19
      15384
      0.008680
      0.073779
      0.917541
      0.443674
      -0.004892
    
    
      872
      0
      69.87
      15.12
      17.13
      4.08
      14
      19
      22
      23359
      0.011550
      0.077670
      0.910780
      0.462138
      -0.004629
    
    
      874
      0
      69.98
      15.07
      17.05
      4.10
      12
      26
      24
      5759
      0.009580
      0.075818
      0.914602
      0.460665
      0.000059
    
    
      878
      0
      63.52
      12.84
      15.76
      4.03
      27
      41
      18
      17251
      0.019949
      0.079616
      0.900435
      0.361699
      -0.004511
    
    
      879
      0
      68.38
      14.30
      16.28
      4.20
      25
      53
      34
      8067
      0.015775
      0.081354
      0.902871
      0.427792
      -0.003822
    
    
      881
      0
      80.02
      17.41
      19.77
      4.05
      28
      45
      30
      15554
      0.000974
      0.103408
      0.895618
      0.426759
      -0.005783
    
    
      894
      0
      64.10
      14.09
      16.24
      3.95
      23
      32
      17
      12164
      0.008230
      0.115984
      0.875785
      0.408528
      -0.005740
    
    
      897
      0
      56.35
      11.20
      12.88
      4.38
      15
      10
      4
      12478
      0.015719
      0.093549
      0.890732
      0.049587
      -0.004804
    
    
      902
      0
      57.97
      11.90
      13.85
      4.19
      31
      38
      32
      11962
      0.007213
      0.080013
      0.912773
      0.399001
      -0.006421
    
    
      903
      0
      84.74
      17.61
      21.04
      4.03
      28
      63
      32
      3833
      0.008932
      0.176357
      0.814711
      0.357600
      -0.002973
    
    
      906
      0
      99.52
      20.60
      24.56
      4.05
      14
      9
      9
      45029
      0.070019
      0.012519
      0.917462
      0.248194
      -0.003369
    
    
      908
      0
      65.26
      13.98
      16.27
      4.01
      26
      31
      18
      33519
      0.002062
      0.084654
      0.913284
      0.379555
      -0.003820
    
  

200 rows × 14 columns



In [7]:

    
Whole_data.describe()









    Out[7]:






  
    
      
      Label
      Avg_sen_len
      Blank
      Avg_num_word
      Avg_word_len
      num_of_weapon
      num_of_bloody
      num_of_mysterious
      num_of_PopWord
      Exclamation_Ratio
      Ques_Mark_Ratio
      Period_Ratio
      Stopwords_Ratio
      Avg_sentiment
    
  
  
    
      count
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
    
    
      mean
      0.500000
      0.243924
      0.281451
      0.287134
      0.461263
      0.448043
      0.444483
      0.467407
      0.175039
      0.129606
      0.539917
      0.514478
      0.834986
      0.635879
    
    
      std
      0.501255
      0.155021
      0.160655
      0.178725
      0.173719
      0.172415
      0.163965
      0.176389
      0.134405
      0.158138
      0.160715
      0.151734
      0.130953
      0.116756
    
    
      min
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.000000
      0.136393
      0.169251
      0.158892
      0.368421
      0.342391
      0.333333
      0.365741
      0.106233
      0.031299
      0.422285
      0.420415
      0.808510
      0.574520
    
    
      50%
      0.500000
      0.212631
      0.244880
      0.250729
      0.463158
      0.434783
      0.448276
      0.444444
      0.143844
      0.068185
      0.541049
      0.531708
      0.845703
      0.638899
    
    
      75%
      1.000000
      0.308421
      0.359713
      0.368513
      0.557895
      0.565217
      0.540230
      0.574074
      0.201665
      0.167277
      0.641179
      0.618006
      0.889067
      0.701004
    
    
      max
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000



In [8]:

    
corrmat = Whole_data.corr()
corrmat









    Out[8]:






  
    
      
      Label
      Avg_sen_len
      Blank
      Avg_num_word
      Avg_word_len
      num_of_weapon
      num_of_bloody
      num_of_mysterious
      num_of_PopWord
      Exclamation_Ratio
      Ques_Mark_Ratio
      Period_Ratio
      Stopwords_Ratio
      Avg_sentiment
    
  
  
    
      Label
      1.000000
      -0.011724
      -0.008069
      0.017057
      -0.165229
      0.144099
      0.080820
      0.081043
      0.025593
      0.088009
      -0.134481
      0.064837
      -0.020507
      -0.048014
    
    
      Avg_sen_len
      -0.011724
      1.000000
      0.984967
      0.979242
      0.251266
      -0.031755
      0.166153
      0.265984
      0.130755
      0.247384
      -0.320978
      0.134113
      -0.019227
      0.142893
    
    
      Blank
      -0.008069
      0.984967
      1.000000
      0.985378
      0.152948
      -0.052497
      0.128518
      0.231573
      0.115210
      0.201513
      -0.314627
      0.154117
      0.048923
      0.150920
    
    
      Avg_num_word
      0.017057
      0.979242
      0.985378
      1.000000
      0.055390
      -0.053303
      0.137360
      0.229812
      0.116027
      0.248042
      -0.281382
      0.100331
      0.007334
      0.155410
    
    
      Avg_word_len
      -0.165229
      0.251266
      0.152948
      0.055390
      1.000000
      0.115597
      0.187885
      0.255186
      0.115421
      0.083783
      -0.216982
      0.136805
      -0.123713
      0.002357
    
    
      num_of_weapon
      0.144099
      -0.031755
      -0.052497
      -0.053303
      0.115597
      1.000000
      0.674252
      0.537257
      0.193223
      -0.031040
      0.099642
      -0.066934
      0.190961
      0.026211
    
    
      num_of_bloody
      0.080820
      0.166153
      0.128518
      0.137360
      0.187885
      0.674252
      1.000000
      0.797364
      0.236442
      0.110291
      0.054063
      -0.106615
      0.287134
      0.076268
    
    
      num_of_mysterious
      0.081043
      0.265984
      0.231573
      0.229812
      0.255186
      0.537257
      0.797364
      1.000000
      0.223923
      0.082695
      -0.090107
      0.030324
      0.331356
      0.129078
    
    
      num_of_PopWord
      0.025593
      0.130755
      0.115210
      0.116027
      0.115421
      0.193223
      0.236442
      0.223923
      1.000000
      0.065132
      -0.089374
      0.039416
      0.072630
      0.531727
    
    
      Exclamation_Ratio
      0.088009
      0.247384
      0.201513
      0.248042
      0.083783
      -0.031040
      0.110291
      0.082695
      0.065132
      1.000000
      -0.019385
      -0.536594
      -0.199896
      0.088052
    
    
      Ques_Mark_Ratio
      -0.134481
      -0.320978
      -0.314627
      -0.281382
      -0.216982
      0.099642
      0.054063
      -0.090107
      -0.089374
      -0.019385
      1.000000
      -0.833280
      0.073163
      -0.043373
    
    
      Period_Ratio
      0.064837
      0.134113
      0.154117
      0.100331
      0.136805
      -0.066934
      -0.106615
      0.030324
      0.039416
      -0.536594
      -0.833280
      1.000000
      0.048785
      -0.012082
    
    
      Stopwords_Ratio
      -0.020507
      -0.019227
      0.048923
      0.007334
      -0.123713
      0.190961
      0.287134
      0.331356
      0.072630
      -0.199896
      0.073163
      0.048785
      1.000000
      0.037057
    
    
      Avg_sentiment
      -0.048014
      0.142893
      0.150920
      0.155410
      0.002357
      0.026211
      0.076268
      0.129078
      0.531727
      0.088052
      -0.043373
      -0.012082
      0.037057
      1.000000



In [9]:

    
import seaborn as sns
import matplotlib.pyplot as plt
# set up the matplotlib figure
f, ax=plt.subplots(figsize=(12,9))
# draw the heatmap using seaborn
sns.heatmap(corrmat,vmax=.8,square=True)
plt.show()



In [10]:

    
c = Whole_data.corr().abs()
s = c.unstack()
so = s.order(kind='quicksort')
so[so > 0.9 ]









    



C:\Users\Skyeo\AppData\Local\Enthought\Canopy\User\lib\site-packages\ipykernel\__main__.py:3: FutureWarning: order is deprecated, use sort_values(...)
  app.launch_new_instance()






    Out[10]:





Avg_sen_len        Avg_num_word         0.979242
Avg_num_word       Avg_sen_len          0.979242
Avg_sen_len        Blank                0.984967
Blank              Avg_sen_len          0.984967
Avg_num_word       Blank                0.985378
Blank              Avg_num_word         0.985378
Label              Label                1.000000
Period_Ratio       Period_Ratio         1.000000
Ques_Mark_Ratio    Ques_Mark_Ratio      1.000000
Exclamation_Ratio  Exclamation_Ratio    1.000000
num_of_PopWord     num_of_PopWord       1.000000
num_of_mysterious  num_of_mysterious    1.000000
num_of_bloody      num_of_bloody        1.000000
num_of_weapon      num_of_weapon        1.000000
Avg_word_len       Avg_word_len         1.000000
Avg_num_word       Avg_num_word         1.000000
Blank              Blank                1.000000
Avg_sen_len        Avg_sen_len          1.000000
Stopwords_Ratio    Stopwords_Ratio      1.000000
Avg_sentiment      Avg_sentiment        1.000000
dtype: float64

separate independent and dependent variable for regression



In [11]:

    
data_y = Whole_data['Label']
data_X = Whole_data.ix[:,Whole_data.columns.difference(['Label','Avg_sen_len','Blank'])]
print data_y.shape
print data_X.shape
print data_y.dtype
print data_X.dtypes









    



(200L,)
(200, 11)
float64
Avg_num_word         float64
Avg_sentiment        float64
Avg_word_len         float64
Exclamation_Ratio    float64
Period_Ratio         float64
Ques_Mark_Ratio      float64
Stopwords_Ratio      float64
num_of_PopWord       float64
num_of_bloody        float64
num_of_mysterious    float64
num_of_weapon        float64
dtype: object



In [12]:

    
data_X.describe()









    Out[12]:






  
    
      
      Avg_num_word
      Avg_sentiment
      Avg_word_len
      Exclamation_Ratio
      Period_Ratio
      Ques_Mark_Ratio
      Stopwords_Ratio
      num_of_PopWord
      num_of_bloody
      num_of_mysterious
      num_of_weapon
    
  
  
    
      count
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
      200.000000
    
    
      mean
      0.287134
      0.635879
      0.461263
      0.129606
      0.514478
      0.539917
      0.834986
      0.175039
      0.444483
      0.467407
      0.448043
    
    
      std
      0.178725
      0.116756
      0.173719
      0.158138
      0.151734
      0.160715
      0.130953
      0.134405
      0.163965
      0.176389
      0.172415
    
    
      min
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.158892
      0.574520
      0.368421
      0.031299
      0.420415
      0.422285
      0.808510
      0.106233
      0.333333
      0.365741
      0.342391
    
    
      50%
      0.250729
      0.638899
      0.463158
      0.068185
      0.531708
      0.541049
      0.845703
      0.143844
      0.448276
      0.444444
      0.434783
    
    
      75%
      0.368513
      0.701004
      0.557895
      0.167277
      0.618006
      0.641179
      0.889067
      0.201665
      0.540230
      0.574074
      0.565217
    
    
      max
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000



In [13]:

    
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier



In [14]:

    
#prepare models
X = data_X
Y = data_y
seed=5
models = []
models.append(('LR', LogisticRegression()))
models.append(('RF', RandomForestClassifier()))
models.append(('GB',GradientBoostingClassifier()))
models.append(('NN', MLPClassifier()))



In [15]:

    
#evaluate the models
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)









    



LR: 0.280000 (0.112250)
RF: 0.605000 (0.078899)
GB: 0.595000 (0.090692)






    



C:\Users\Skyeo\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\neural_network\multilayer_perceptron.py:563: ConvergenceWarning: Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.
  % (), ConvergenceWarning)






    



NN: 0.210000 (0.181384)



In [16]:

    
fig = plt.figure()
fig.suptitle('Model Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()



In [17]:

    
#split training and test datasets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size = 0.3, random_state = 0)
print("X_train : " + str(X_train.shape))
print("X_test : " + str(X_test.shape))
print("y_train : " + str(y_train.shape))
print("y_test : " + str(y_test.shape))









    



X_train : (140, 11)
X_test : (60, 11)
y_train : (140L,)
y_test : (60L,)






    



C:\Users\Skyeo\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [21]:

    
#test the impact of tree size on prediction accuracy
trees = range(25)
accuracy = np.zeros(25)
for idx in range(len(trees)):
    classifier=RandomForestClassifier(n_estimators=idx + 1)
    classifier=classifier.fit(X_train,y_train)
    predictions=classifier.predict(X_test)
    accuracy[idx]=accuracy_score(y_test,predictions)



In [22]:

    
#plot the effect of tree size
plt.cla()
plt.plot(trees,accuracy)
plt.show()



In [50]:

    
#fit in random forest argorithm
print'Random Forest'
rf = RandomForestClassifier(n_estimators=17,min_samples_leaf=1)
print 'Fitting model'
rf_fit=rf.fit(X_train,y_train)
print 'Predcting on test set'
y_pred_rf =rf.predict(X_test)
print confusion_matrix(y_test,y_pred_rf)
print accuracy_score(y_test,y_pred_rf)
print f1_score(y_test,y_pred_rf)









    



Random Forest
Fitting model
Predcting on test set
[[24  7]
 [11 18]]
0.7
0.666666666667



In [51]:

    
#fit in GradientBoosting
print'Gradient Boosting'
gb = GradientBoostingClassifier(n_estimators=250,learning_rate=0.05, max_depth=10, max_features = 0.8, min_samples_leaf=4, random_state=0,subsample =0.9)
print 'Fitting model'
gb.fit(X_train,y_train)
print 'Predicting on test set'
y_pred_gb = gb.predict(X_test)
print confusion_matrix(y_test,y_pred_gb)
print accuracy_score(y_test,y_pred_gb)
print f1_score(y_test,y_pred_gb)









    



Gradient Boosting
Fitting model
Predicting on test set
[[20 11]
 [ 5 24]]
0.733333333333
0.75



In [52]:

    
fpr_rf, tpr_rf, _ = metrics.roc_curve(y_test, y_pred_rf)
fpr_gb,tpr_gb, _= metrics.roc_curve(y_test, y_pred_gb)
plt.plot(fpr_rf,tpr_rf)
plt.plot(fpr_gb,tpr_gb,color='orange')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.title('ROC curve')
plt.xlabel('False Positive Rate(1-Specificity)')
plt.ylabel('True Positive Rate(Sensitivity)')
plt.grid(True)
plt.plot((0.0,1.0),(0.0,1.0),color='grey',linewidth=1,linestyle='--')
plt.show()
print ('Auc for random forest is ' + str(auc(fpr_rf, tpr_rf)))
print ('Auc for Gradient Boosting is ' + str(auc(fpr_gb,tpr_gb)))









    












    



Auc for random forest is 0.69744160178
Auc for Gradient Boosting is 0.73637374861



In [53]:

    
# get the feature importance 
names = data_X.columns.values
print "Features sorted by their score"
print sorted(zip(map(lambda x: round(x,4),rf.feature_importances_),names),reverse=True)









    



Features sorted by their score
[(0.1712, 'Avg_word_len'), (0.1175, 'num_of_weapon'), (0.0985, 'Avg_num_word'), (0.0926, 'num_of_mysterious'), (0.0923, 'num_of_PopWord'), (0.0821, 'Period_Ratio'), (0.0791, 'Ques_Mark_Ratio'), (0.0751, 'Exclamation_Ratio'), (0.0677, 'num_of_bloody'), (0.0645, 'Stopwords_Ratio'), (0.0594, 'Avg_sentiment')]



In [54]:

    
#create parameter for plot
importances=rf.feature_importances_
indices = np.argsort(importances)[::-1]
std = np.std([tree.feature_importances_ for tree in rf.estimators_],axis=0)
print importances
print indices
print std









    



[ 0.09854367  0.05937063  0.1711861   0.07506371  0.08212499  0.07908636
  0.06450674  0.09231495  0.06771732  0.09260681  0.11747871]
[ 2 10  0  9  7  4  5  3  8  6  1]
[ 0.06185107  0.04254249  0.07439091  0.03337515  0.07368179  0.05610773
  0.05238061  0.06657811  0.0539612   0.04544956  0.08297826]



In [55]:

    
# Plot the feature importances of random forest
import matplotlib.pyplot as plt
plt.figure()
plt.title("Feature importances")
plt.bar(range(data_X.shape[1]), importances[indices],
       color="r", yerr=std[indices],align="center")
plt.xticks(range(data_X.shape[1]), indices)
plt.xlim([-1, data_X.shape[1]])
plt.show()



In [56]:

    
log_loss(y_test,y_pred_rf,normalize=True)









    Out[56]:





10.36172620484006



In [57]:

    
def score(sss,yyy):
    i=0
    compare = list()
    while i< len(sss):
        if sss[i] == yyy[i]:
            compare.append(1)
        else: 
            compare.append(0)
        i+=1
    return(compare)



In [58]:

    
ytest=Whole_data['Label'].values



In [59]:

    
c=score(y_pred_rf,ytest)



In [60]:

    
float(sum(c))/float(len(c))









    Out[60]:





0.4166666666666667



In [61]:

    
print 'GradientBoosting'
gb = GradientBoostingClassifier()
print 'Fitting model'
gb.fit(X_train,y_train)
print 'Predicting on test set'
y_pred_gb = gb.predict(X_test)









    



GradientBoosting
Fitting model
Predicting on test set



In [63]:

    
from sklearn.metrics import precision_score,recall_score, confusion_matrix, classification_report,accuracy_score, f1_score
print 'Accuracy:', accuracy_score(y_test, y_pred_rf)
print 'F1 score:', f1_score(y_test, y_pred_rf)
print 'Recall:', recall_score(y_test, y_pred_rf)
print 'Precision:', precision_score(y_test, y_pred_rf)
print '\n clasification report:\n', classification_report(y_test,y_pred_rf)
print '\n confussion matrix:\n',confusion_matrix(y_test, y_pred_rf)









    



 Accuracy: 0.7
F1 score: 0.666666666667
Recall: 0.620689655172
Precision: 0.72

 clasification report:
             precision    recall  f1-score   support

        0.0       0.69      0.77      0.73        31
        1.0       0.72      0.62      0.67        29

avg / total       0.70      0.70      0.70        60


 confussion matrix:
[[24  7]
 [11 18]]

	Label	Avg_sen_len	Blank	Avg_num_word	Avg_word_len	num_of_weapon	num_of_bloody	num_of_mysterious	num_of_PopWord	Exclamation_Ratio	Ques_Mark_Ratio	Period_Ratio	Stopwords_Ratio	Avg_sentiment
BOOK ID
1	1	66.21	13.50	15.13	4.38	34	61	35	30747	0.001428	0.070812	0.927761	0.431888	-0.004217
2	1	53.26	11.76	13.30	4.00	21	41	18	14681	0.004876	0.082004	0.913121	0.409934	-0.005893
3	1	60.19	12.39	15.62	3.85	17	34	15	14669	0.021185	0.127748	0.851067	0.376801	-0.005749
4	1	52.72	10.56	12.32	4.28	18	31	19	10815	0.037219	0.131997	0.830785	0.403794	-0.003993
5	1	82.84	19.73	21.67	3.82	18	16	11	13146	0.001936	0.077184	0.920881	0.445443	-0.003967
6	1	78.68	17.08	19.38	4.06	37	74	46	62775	0.039811	0.094483	0.865706	0.449838	-0.002384
7	1	100.66	21.16	24.28	4.15	26	69	37	28809	0.045664	0.085841	0.868496	0.434448	-0.003804
8	1	50.32	10.91	13.31	3.78	33	53	22	26800	0.002528	0.086703	0.910769	0.402746	-0.004073
9	1	83.64	18.06	20.42	4.10	19	36	21	7448	0.025133	0.061502	0.913365	0.444903	-0.007596
11	1	108.08	21.29	25.04	4.32	28	64	41	15372	0.005491	0.021223	0.973286	0.363366	-0.005909
12	1	55.07	11.42	14.28	3.86	31	45	36	37852	0.006527	0.101500	0.891973	0.378242	-0.002677
13	1	55.43	11.27	13.04	4.25	23	51	32	19667	0.002848	0.119022	0.878130	0.389047	-0.005003
14	1	47.81	10.12	11.63	4.11	24	44	22	13350	0.008105	0.108889	0.883007	0.395945	-0.007379
15	1	54.25	11.20	13.03	4.16	19	35	23	13986	0.027626	0.132134	0.840240	0.402049	-0.004922
16	1	49.55	10.02	11.62	4.27	31	54	30	20512	0.024403	0.071559	0.904038	0.380537	-0.004479
17	1	49.90	10.30	12.96	3.85	19	38	23	17698	0.002038	0.131960	0.866002	0.390458	-0.005908
19	1	48.47	10.24	11.75	4.12	19	34	13	14747	0.001867	0.138056	0.860077	0.409527	-0.006761
21	1	54.84	11.39	13.03	4.21	21	59	31	18827	0.008961	0.132456	0.858583	0.418912	-0.006739
22	1	47.45	9.88	12.47	3.80	21	29	19	14796	0.002936	0.139375	0.857690	0.381034	-0.005744
23	1	83.58	18.14	20.68	4.04	36	68	39	37032	0.042359	0.096669	0.860972	0.445314	-0.003014
24	1	56.23	11.70	14.50	3.88	47	68	45	25212	0.009069	0.088477	0.902454	0.378405	-0.004760
25	1	62.53	12.76	14.25	4.39	34	47	32	32662	0.000765	0.083484	0.915752	0.435393	-0.003651
26	1	56.91	11.51	13.38	4.25	17	33	21	13110	0.030593	0.130057	0.839350	0.390497	-0.004099
27	1	80.83	17.08	20.34	3.97	24	46	30	35965	0.071162	0.087031	0.841807	0.424724	-0.002683
28	1	56.31	11.66	14.59	3.86	33	63	36	23725	0.017976	0.075566	0.906458	0.371894	-0.005111
30	1	63.32	13.42	16.34	3.88	33	57	37	20774	0.005794	0.082254	0.911952	0.393344	-0.005641
31	1	59.90	12.70	15.41	3.89	22	33	18	19612	0.012434	0.100391	0.887175	0.382525	-0.003405
32	1	88.37	18.47	20.95	4.22	26	51	32	19529	0.008701	0.140993	0.850306	0.421528	-0.006533
33	1	84.31	17.21	20.60	4.09	45	75	40	88656	0.093831	0.083238	0.822931	0.413722	-0.001785
34	1	66.21	12.53	15.12	4.38	29	49	47	22661	0.029045	0.088927	0.882028	0.358528	-0.004122
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
816	0	105.78	20.92	24.09	4.39	20	45	32	21616	0.011321	0.057714	0.930966	0.405790	-0.005171
818	0	72.88	15.26	17.51	4.16	21	44	33	16644	0.003178	0.121469	0.875353	0.407512	-0.003924
819	0	51.25	11.19	12.77	4.01	27	29	16	15698	0.000698	0.079407	0.919895	0.401947	-0.006857
822	0	50.31	10.84	12.52	4.02	23	32	11	20876	0.001383	0.109286	0.889331	0.395386	-0.005711
826	0	66.02	13.64	15.36	4.30	34	41	24	13239	0.006555	0.116445	0.877000	0.414128	-0.004404
828	0	67.67	14.96	17.01	3.98	17	31	20	12801	0.032057	0.145448	0.822495	0.405323	-0.005043
832	0	62.73	13.17	17.02	3.69	23	40	23	17942	0.004077	0.112054	0.883870	0.363192	-0.004725
834	0	67.65	14.30	16.10	4.20	27	39	24	13029	0.001299	0.097178	0.901523	0.417177	-0.005183
836	0	63.27	13.23	15.10	4.19	23	47	25	15349	0.000767	0.110376	0.888857	0.407069	-0.004114
839	0	59.67	12.45	14.24	4.19	19	47	28	13891	0.003133	0.134724	0.862143	0.404630	-0.004949
844	0	55.33	11.45	13.35	4.14	28	35	15	13309	0.003425	0.126202	0.870373	0.397424	-0.006587
850	0	66.13	13.41	15.74	4.20	23	46	36	11411	0.005469	0.101492	0.893039	0.411550	-0.007386
854	0	101.99	21.96	24.94	4.09	26	40	29	15100	0.000913	0.112962	0.886125	0.419969	-0.006389
855	0	73.59	15.47	17.77	4.14	32	58	35	15917	0.007117	0.105749	0.887134	0.411086	-0.006587
860	0	82.29	16.58	18.98	4.33	23	65	43	18326	0.034692	0.056580	0.908728	0.411588	-0.004099
864	0	46.52	9.51	11.18	4.16	21	39	34	13860	0.001450	0.099573	0.898978	0.385226	-0.005997
866	0	65.69	13.94	16.02	4.10	14	19	21	31326	0.013528	0.101190	0.885281	0.433487	-0.000601
868	0	68.44	14.50	16.66	4.11	12	18	21	18371	0.010949	0.079423	0.909628	0.438282	-0.005570
870	0	70.24	14.84	17.03	4.13	13	20	19	15384	0.008680	0.073779	0.917541	0.443674	-0.004892
872	0	69.87	15.12	17.13	4.08	14	19	22	23359	0.011550	0.077670	0.910780	0.462138	-0.004629
874	0	69.98	15.07	17.05	4.10	12	26	24	5759	0.009580	0.075818	0.914602	0.460665	0.000059
878	0	63.52	12.84	15.76	4.03	27	41	18	17251	0.019949	0.079616	0.900435	0.361699	-0.004511
879	0	68.38	14.30	16.28	4.20	25	53	34	8067	0.015775	0.081354	0.902871	0.427792	-0.003822
881	0	80.02	17.41	19.77	4.05	28	45	30	15554	0.000974	0.103408	0.895618	0.426759	-0.005783
894	0	64.10	14.09	16.24	3.95	23	32	17	12164	0.008230	0.115984	0.875785	0.408528	-0.005740
897	0	56.35	11.20	12.88	4.38	15	10	4	12478	0.015719	0.093549	0.890732	0.049587	-0.004804
902	0	57.97	11.90	13.85	4.19	31	38	32	11962	0.007213	0.080013	0.912773	0.399001	-0.006421
903	0	84.74	17.61	21.04	4.03	28	63	32	3833	0.008932	0.176357	0.814711	0.357600	-0.002973
906	0	99.52	20.60	24.56	4.05	14	9	9	45029	0.070019	0.012519	0.917462	0.248194	-0.003369
908	0	65.26	13.98	16.27	4.01	26	31	18	33519	0.002062	0.084654	0.913284	0.379555	-0.003820

	Label	Avg_sen_len	Blank	Avg_num_word	Avg_word_len	num_of_weapon	num_of_bloody	num_of_mysterious	num_of_PopWord	Exclamation_Ratio	Ques_Mark_Ratio	Period_Ratio	Stopwords_Ratio	Avg_sentiment
count	200.000000	200.000000	200.000000	200.000000	200.000000	200.000000	200.000000	200.000000	200.000000	200.000000	200.000000	200.000000	200.000000	200.000000
mean	0.500000	64.510850	13.480000	15.734350	4.098200	26.610000	43.670000	26.240000	21469.220000	0.016519	0.105939	0.877542	0.394061	-0.004905
std	0.501255	13.181462	2.745602	3.065132	0.165033	7.931104	14.264941	9.525021	13997.464919	0.020156	0.030764	0.036451	0.054025	0.001592
min	0.000000	43.770000	8.670000	10.810000	3.660000	6.000000	5.000000	1.000000	3240.000000	0.000000	0.002588	0.753951	0.049587	-0.013573
25%	0.000000	55.367500	11.562500	13.535000	4.010000	21.750000	34.000000	20.750000	14303.500000	0.003989	0.083422	0.854945	0.383138	-0.005741
50%	0.500000	61.850000	12.855000	15.110000	4.100000	26.000000	44.000000	25.000000	18220.500000	0.008691	0.106156	0.881681	0.398482	-0.004863
75%	1.000000	69.995000	14.817500	17.130000	4.190000	32.000000	52.000000	32.000000	24242.250000	0.021320	0.125323	0.902412	0.416372	-0.004017
max	1.000000	128.800000	25.760000	27.960000	4.610000	52.000000	92.000000	55.000000	107384.000000	0.127455	0.194009	0.994177	0.462138	0.000059

	Label	Avg_sen_len	Blank	Avg_num_word	Avg_word_len	num_of_weapon	num_of_bloody	num_of_mysterious	num_of_PopWord	Exclamation_Ratio	Ques_Mark_Ratio	Period_Ratio	Stopwords_Ratio	Avg_sentiment
Label	1.000000	-0.011724	-0.008069	0.017057	-0.165229	0.144099	0.080820	0.081043	0.025593	0.088009	-0.134481	0.064837	-0.020507	-0.048014
Avg_sen_len	-0.011724	1.000000	0.984967	0.979242	0.251266	-0.031755	0.166153	0.265984	0.130755	0.247384	-0.320978	0.134113	-0.019227	0.142893
Blank	-0.008069	0.984967	1.000000	0.985378	0.152948	-0.052497	0.128518	0.231573	0.115210	0.201513	-0.314627	0.154117	0.048923	0.150920
Avg_num_word	0.017057	0.979242	0.985378	1.000000	0.055390	-0.053303	0.137360	0.229812	0.116027	0.248042	-0.281382	0.100331	0.007334	0.155410
Avg_word_len	-0.165229	0.251266	0.152948	0.055390	1.000000	0.115597	0.187885	0.255186	0.115421	0.083783	-0.216982	0.136805	-0.123713	0.002357
num_of_weapon	0.144099	-0.031755	-0.052497	-0.053303	0.115597	1.000000	0.674252	0.537257	0.193223	-0.031040	0.099642	-0.066934	0.190961	0.026211
num_of_bloody	0.080820	0.166153	0.128518	0.137360	0.187885	0.674252	1.000000	0.797364	0.236442	0.110291	0.054063	-0.106615	0.287134	0.076268
num_of_mysterious	0.081043	0.265984	0.231573	0.229812	0.255186	0.537257	0.797364	1.000000	0.223923	0.082695	-0.090107	0.030324	0.331356	0.129078
num_of_PopWord	0.025593	0.130755	0.115210	0.116027	0.115421	0.193223	0.236442	0.223923	1.000000	0.065132	-0.089374	0.039416	0.072630	0.531727
Exclamation_Ratio	0.088009	0.247384	0.201513	0.248042	0.083783	-0.031040	0.110291	0.082695	0.065132	1.000000	-0.019385	-0.536594	-0.199896	0.088052
Ques_Mark_Ratio	-0.134481	-0.320978	-0.314627	-0.281382	-0.216982	0.099642	0.054063	-0.090107	-0.089374	-0.019385	1.000000	-0.833280	0.073163	-0.043373
Period_Ratio	0.064837	0.134113	0.154117	0.100331	0.136805	-0.066934	-0.106615	0.030324	0.039416	-0.536594	-0.833280	1.000000	0.048785	-0.012082
Stopwords_Ratio	-0.020507	-0.019227	0.048923	0.007334	-0.123713	0.190961	0.287134	0.331356	0.072630	-0.199896	0.073163	0.048785	1.000000	0.037057
Avg_sentiment	-0.048014	0.142893	0.150920	0.155410	0.002357	0.026211	0.076268	0.129078	0.531727	0.088052	-0.043373	-0.012082	0.037057	1.000000