In [1]:
##############################
# Author: Skye Ouyang
# Date: 24th Apr
##############################
import pandas as pd
import numpy as np
from pandas import DataFrame
import sklearn
import requests
import StringIO
import json
import re

In [2]:
base_path='./'
whole_data=pd.read_csv(base_path+'Text Analytics.csv',index_col='BOOK ID') # DataFrame.from_csv is deprecated; read_csv is the supported loader
whole_data


Out[2]:
Label Avg_sen_len Blank Avg_num_word Avg_word_len num_of_weapon num_of_bloody num_of_mysterious num_of_PopWord Exclamation_Ratio Ques_Mark_Ratio Period_Ratio Stopwords_Ratio Avg_sentiment
BOOK ID
1 1 66.21 13.50 15.13 4.38 34 61 35 30747 0.001428 0.070812 0.927761 0.431888 -0.004217
2 1 53.26 11.76 13.30 4.00 21 41 18 14681 0.004876 0.082004 0.913121 0.409934 -0.005893
3 1 60.19 12.39 15.62 3.85 17 34 15 14669 0.021185 0.127748 0.851067 0.376801 -0.005749
4 1 52.72 10.56 12.32 4.28 18 31 19 10815 0.037219 0.131997 0.830785 0.403794 -0.003993
5 1 82.84 19.73 21.67 3.82 18 16 11 13146 0.001936 0.077184 0.920881 0.445443 -0.003967
6 1 78.68 17.08 19.38 4.06 37 74 46 62775 0.039811 0.094483 0.865706 0.449838 -0.002384
7 1 100.66 21.16 24.28 4.15 26 69 37 28809 0.045664 0.085841 0.868496 0.434448 -0.003804
8 1 50.32 10.91 13.31 3.78 33 53 22 26800 0.002528 0.086703 0.910769 0.402746 -0.004073
9 1 83.64 18.06 20.42 4.10 19 36 21 7448 0.025133 0.061502 0.913365 0.444903 -0.007596
11 1 108.08 21.29 25.04 4.32 28 64 41 15372 0.005491 0.021223 0.973286 0.363366 -0.005909
12 1 55.07 11.42 14.28 3.86 31 45 36 37852 0.006527 0.101500 0.891973 0.378242 -0.002677
13 1 55.43 11.27 13.04 4.25 23 51 32 19667 0.002848 0.119022 0.878130 0.389047 -0.005003
14 1 47.81 10.12 11.63 4.11 24 44 22 13350 0.008105 0.108889 0.883007 0.395945 -0.007379
15 1 54.25 11.20 13.03 4.16 19 35 23 13986 0.027626 0.132134 0.840240 0.402049 -0.004922
16 1 49.55 10.02 11.62 4.27 31 54 30 20512 0.024403 0.071559 0.904038 0.380537 -0.004479
17 1 49.90 10.30 12.96 3.85 19 38 23 17698 0.002038 0.131960 0.866002 0.390458 -0.005908
19 1 48.47 10.24 11.75 4.12 19 34 13 14747 0.001867 0.138056 0.860077 0.409527 -0.006761
21 1 54.84 11.39 13.03 4.21 21 59 31 18827 0.008961 0.132456 0.858583 0.418912 -0.006739
22 1 47.45 9.88 12.47 3.80 21 29 19 14796 0.002936 0.139375 0.857690 0.381034 -0.005744
23 1 83.58 18.14 20.68 4.04 36 68 39 37032 0.042359 0.096669 0.860972 0.445314 -0.003014
24 1 56.23 11.70 14.50 3.88 47 68 45 25212 0.009069 0.088477 0.902454 0.378405 -0.004760
25 1 62.53 12.76 14.25 4.39 34 47 32 32662 0.000765 0.083484 0.915752 0.435393 -0.003651
26 1 56.91 11.51 13.38 4.25 17 33 21 13110 0.030593 0.130057 0.839350 0.390497 -0.004099
27 1 80.83 17.08 20.34 3.97 24 46 30 35965 0.071162 0.087031 0.841807 0.424724 -0.002683
28 1 56.31 11.66 14.59 3.86 33 63 36 23725 0.017976 0.075566 0.906458 0.371894 -0.005111
30 1 63.32 13.42 16.34 3.88 33 57 37 20774 0.005794 0.082254 0.911952 0.393344 -0.005641
31 1 59.90 12.70 15.41 3.89 22 33 18 19612 0.012434 0.100391 0.887175 0.382525 -0.003405
32 1 88.37 18.47 20.95 4.22 26 51 32 19529 0.008701 0.140993 0.850306 0.421528 -0.006533
33 1 84.31 17.21 20.60 4.09 45 75 40 88656 0.093831 0.083238 0.822931 0.413722 -0.001785
34 1 66.21 12.53 15.12 4.38 29 49 47 22661 0.029045 0.088927 0.882028 0.358528 -0.004122
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
816 0 105.78 20.92 24.09 4.39 20 45 32 21616 0.011321 0.057714 0.930966 0.405790 -0.005171
818 0 72.88 15.26 17.51 4.16 21 44 33 16644 0.003178 0.121469 0.875353 0.407512 -0.003924
819 0 51.25 11.19 12.77 4.01 27 29 16 15698 0.000698 0.079407 0.919895 0.401947 -0.006857
822 0 50.31 10.84 12.52 4.02 23 32 11 20876 0.001383 0.109286 0.889331 0.395386 -0.005711
826 0 66.02 13.64 15.36 4.30 34 41 24 13239 0.006555 0.116445 0.877000 0.414128 -0.004404
828 0 67.67 14.96 17.01 3.98 17 31 20 12801 0.032057 0.145448 0.822495 0.405323 -0.005043
832 0 62.73 13.17 17.02 3.69 23 40 23 17942 0.004077 0.112054 0.883870 0.363192 -0.004725
834 0 67.65 14.30 16.10 4.20 27 39 24 13029 0.001299 0.097178 0.901523 0.417177 -0.005183
836 0 63.27 13.23 15.10 4.19 23 47 25 15349 0.000767 0.110376 0.888857 0.407069 -0.004114
839 0 59.67 12.45 14.24 4.19 19 47 28 13891 0.003133 0.134724 0.862143 0.404630 -0.004949
844 0 55.33 11.45 13.35 4.14 28 35 15 13309 0.003425 0.126202 0.870373 0.397424 -0.006587
850 0 66.13 13.41 15.74 4.20 23 46 36 11411 0.005469 0.101492 0.893039 0.411550 -0.007386
854 0 101.99 21.96 24.94 4.09 26 40 29 15100 0.000913 0.112962 0.886125 0.419969 -0.006389
855 0 73.59 15.47 17.77 4.14 32 58 35 15917 0.007117 0.105749 0.887134 0.411086 -0.006587
860 0 82.29 16.58 18.98 4.33 23 65 43 18326 0.034692 0.056580 0.908728 0.411588 -0.004099
864 0 46.52 9.51 11.18 4.16 21 39 34 13860 0.001450 0.099573 0.898978 0.385226 -0.005997
866 0 65.69 13.94 16.02 4.10 14 19 21 31326 0.013528 0.101190 0.885281 0.433487 -0.000601
868 0 68.44 14.50 16.66 4.11 12 18 21 18371 0.010949 0.079423 0.909628 0.438282 -0.005570
870 0 70.24 14.84 17.03 4.13 13 20 19 15384 0.008680 0.073779 0.917541 0.443674 -0.004892
872 0 69.87 15.12 17.13 4.08 14 19 22 23359 0.011550 0.077670 0.910780 0.462138 -0.004629
874 0 69.98 15.07 17.05 4.10 12 26 24 5759 0.009580 0.075818 0.914602 0.460665 0.000059
878 0 63.52 12.84 15.76 4.03 27 41 18 17251 0.019949 0.079616 0.900435 0.361699 -0.004511
879 0 68.38 14.30 16.28 4.20 25 53 34 8067 0.015775 0.081354 0.902871 0.427792 -0.003822
881 0 80.02 17.41 19.77 4.05 28 45 30 15554 0.000974 0.103408 0.895618 0.426759 -0.005783
894 0 64.10 14.09 16.24 3.95 23 32 17 12164 0.008230 0.115984 0.875785 0.408528 -0.005740
897 0 56.35 11.20 12.88 4.38 15 10 4 12478 0.015719 0.093549 0.890732 0.049587 -0.004804
902 0 57.97 11.90 13.85 4.19 31 38 32 11962 0.007213 0.080013 0.912773 0.399001 -0.006421
903 0 84.74 17.61 21.04 4.03 28 63 32 3833 0.008932 0.176357 0.814711 0.357600 -0.002973
906 0 99.52 20.60 24.56 4.05 14 9 9 45029 0.070019 0.012519 0.917462 0.248194 -0.003369
908 0 65.26 13.98 16.27 4.01 26 31 18 33519 0.002062 0.084654 0.913284 0.379555 -0.003820

200 rows × 14 columns


In [3]:
whole_data.head()


Out[3]:
Label Avg_sen_len Blank Avg_num_word Avg_word_len num_of_weapon num_of_bloody num_of_mysterious num_of_PopWord Exclamation_Ratio Ques_Mark_Ratio Period_Ratio Stopwords_Ratio Avg_sentiment
BOOK ID
1 1 66.21 13.50 15.13 4.38 34 61 35 30747 0.001428 0.070812 0.927761 0.431888 -0.004217
2 1 53.26 11.76 13.30 4.00 21 41 18 14681 0.004876 0.082004 0.913121 0.409934 -0.005893
3 1 60.19 12.39 15.62 3.85 17 34 15 14669 0.021185 0.127748 0.851067 0.376801 -0.005749
4 1 52.72 10.56 12.32 4.28 18 31 19 10815 0.037219 0.131997 0.830785 0.403794 -0.003993
5 1 82.84 19.73 21.67 3.82 18 16 11 13146 0.001936 0.077184 0.920881 0.445443 -0.003967

In [4]:
whole_data.describe()


Out[4]:
Label Avg_sen_len Blank Avg_num_word Avg_word_len num_of_weapon num_of_bloody num_of_mysterious num_of_PopWord Exclamation_Ratio Ques_Mark_Ratio Period_Ratio Stopwords_Ratio Avg_sentiment
count 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000
mean 0.500000 64.510850 13.480000 15.734350 4.098200 26.610000 43.670000 26.240000 21469.220000 0.016519 0.105939 0.877542 0.394061 -0.004905
std 0.501255 13.181462 2.745602 3.065132 0.165033 7.931104 14.264941 9.525021 13997.464919 0.020156 0.030764 0.036451 0.054025 0.001592
min 0.000000 43.770000 8.670000 10.810000 3.660000 6.000000 5.000000 1.000000 3240.000000 0.000000 0.002588 0.753951 0.049587 -0.013573
25% 0.000000 55.367500 11.562500 13.535000 4.010000 21.750000 34.000000 20.750000 14303.500000 0.003989 0.083422 0.854945 0.383138 -0.005741
50% 0.500000 61.850000 12.855000 15.110000 4.100000 26.000000 44.000000 25.000000 18220.500000 0.008691 0.106156 0.881681 0.398482 -0.004863
75% 1.000000 69.995000 14.817500 17.130000 4.190000 32.000000 52.000000 32.000000 24242.250000 0.021320 0.125323 0.902412 0.416372 -0.004017
max 1.000000 128.800000 25.760000 27.960000 4.610000 52.000000 92.000000 55.000000 107384.000000 0.127455 0.194009 0.994177 0.462138 0.000059

In [6]:
# min-max scale every column to [0,1], storing the result in a new frame
# Whole_data (capital W); the original, unscaled whole_data is displayed below
Whole_data=(whole_data-whole_data.min())/(whole_data.max()-whole_data.min())
whole_data


Out[6]:
Label Avg_sen_len Blank Avg_num_word Avg_word_len num_of_weapon num_of_bloody num_of_mysterious num_of_PopWord Exclamation_Ratio Ques_Mark_Ratio Period_Ratio Stopwords_Ratio Avg_sentiment
BOOK ID
1 1 66.21 13.50 15.13 4.38 34 61 35 30747 0.001428 0.070812 0.927761 0.431888 -0.004217
2 1 53.26 11.76 13.30 4.00 21 41 18 14681 0.004876 0.082004 0.913121 0.409934 -0.005893
3 1 60.19 12.39 15.62 3.85 17 34 15 14669 0.021185 0.127748 0.851067 0.376801 -0.005749
4 1 52.72 10.56 12.32 4.28 18 31 19 10815 0.037219 0.131997 0.830785 0.403794 -0.003993
5 1 82.84 19.73 21.67 3.82 18 16 11 13146 0.001936 0.077184 0.920881 0.445443 -0.003967
6 1 78.68 17.08 19.38 4.06 37 74 46 62775 0.039811 0.094483 0.865706 0.449838 -0.002384
7 1 100.66 21.16 24.28 4.15 26 69 37 28809 0.045664 0.085841 0.868496 0.434448 -0.003804
8 1 50.32 10.91 13.31 3.78 33 53 22 26800 0.002528 0.086703 0.910769 0.402746 -0.004073
9 1 83.64 18.06 20.42 4.10 19 36 21 7448 0.025133 0.061502 0.913365 0.444903 -0.007596
11 1 108.08 21.29 25.04 4.32 28 64 41 15372 0.005491 0.021223 0.973286 0.363366 -0.005909
12 1 55.07 11.42 14.28 3.86 31 45 36 37852 0.006527 0.101500 0.891973 0.378242 -0.002677
13 1 55.43 11.27 13.04 4.25 23 51 32 19667 0.002848 0.119022 0.878130 0.389047 -0.005003
14 1 47.81 10.12 11.63 4.11 24 44 22 13350 0.008105 0.108889 0.883007 0.395945 -0.007379
15 1 54.25 11.20 13.03 4.16 19 35 23 13986 0.027626 0.132134 0.840240 0.402049 -0.004922
16 1 49.55 10.02 11.62 4.27 31 54 30 20512 0.024403 0.071559 0.904038 0.380537 -0.004479
17 1 49.90 10.30 12.96 3.85 19 38 23 17698 0.002038 0.131960 0.866002 0.390458 -0.005908
19 1 48.47 10.24 11.75 4.12 19 34 13 14747 0.001867 0.138056 0.860077 0.409527 -0.006761
21 1 54.84 11.39 13.03 4.21 21 59 31 18827 0.008961 0.132456 0.858583 0.418912 -0.006739
22 1 47.45 9.88 12.47 3.80 21 29 19 14796 0.002936 0.139375 0.857690 0.381034 -0.005744
23 1 83.58 18.14 20.68 4.04 36 68 39 37032 0.042359 0.096669 0.860972 0.445314 -0.003014
24 1 56.23 11.70 14.50 3.88 47 68 45 25212 0.009069 0.088477 0.902454 0.378405 -0.004760
25 1 62.53 12.76 14.25 4.39 34 47 32 32662 0.000765 0.083484 0.915752 0.435393 -0.003651
26 1 56.91 11.51 13.38 4.25 17 33 21 13110 0.030593 0.130057 0.839350 0.390497 -0.004099
27 1 80.83 17.08 20.34 3.97 24 46 30 35965 0.071162 0.087031 0.841807 0.424724 -0.002683
28 1 56.31 11.66 14.59 3.86 33 63 36 23725 0.017976 0.075566 0.906458 0.371894 -0.005111
30 1 63.32 13.42 16.34 3.88 33 57 37 20774 0.005794 0.082254 0.911952 0.393344 -0.005641
31 1 59.90 12.70 15.41 3.89 22 33 18 19612 0.012434 0.100391 0.887175 0.382525 -0.003405
32 1 88.37 18.47 20.95 4.22 26 51 32 19529 0.008701 0.140993 0.850306 0.421528 -0.006533
33 1 84.31 17.21 20.60 4.09 45 75 40 88656 0.093831 0.083238 0.822931 0.413722 -0.001785
34 1 66.21 12.53 15.12 4.38 29 49 47 22661 0.029045 0.088927 0.882028 0.358528 -0.004122
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
816 0 105.78 20.92 24.09 4.39 20 45 32 21616 0.011321 0.057714 0.930966 0.405790 -0.005171
818 0 72.88 15.26 17.51 4.16 21 44 33 16644 0.003178 0.121469 0.875353 0.407512 -0.003924
819 0 51.25 11.19 12.77 4.01 27 29 16 15698 0.000698 0.079407 0.919895 0.401947 -0.006857
822 0 50.31 10.84 12.52 4.02 23 32 11 20876 0.001383 0.109286 0.889331 0.395386 -0.005711
826 0 66.02 13.64 15.36 4.30 34 41 24 13239 0.006555 0.116445 0.877000 0.414128 -0.004404
828 0 67.67 14.96 17.01 3.98 17 31 20 12801 0.032057 0.145448 0.822495 0.405323 -0.005043
832 0 62.73 13.17 17.02 3.69 23 40 23 17942 0.004077 0.112054 0.883870 0.363192 -0.004725
834 0 67.65 14.30 16.10 4.20 27 39 24 13029 0.001299 0.097178 0.901523 0.417177 -0.005183
836 0 63.27 13.23 15.10 4.19 23 47 25 15349 0.000767 0.110376 0.888857 0.407069 -0.004114
839 0 59.67 12.45 14.24 4.19 19 47 28 13891 0.003133 0.134724 0.862143 0.404630 -0.004949
844 0 55.33 11.45 13.35 4.14 28 35 15 13309 0.003425 0.126202 0.870373 0.397424 -0.006587
850 0 66.13 13.41 15.74 4.20 23 46 36 11411 0.005469 0.101492 0.893039 0.411550 -0.007386
854 0 101.99 21.96 24.94 4.09 26 40 29 15100 0.000913 0.112962 0.886125 0.419969 -0.006389
855 0 73.59 15.47 17.77 4.14 32 58 35 15917 0.007117 0.105749 0.887134 0.411086 -0.006587
860 0 82.29 16.58 18.98 4.33 23 65 43 18326 0.034692 0.056580 0.908728 0.411588 -0.004099
864 0 46.52 9.51 11.18 4.16 21 39 34 13860 0.001450 0.099573 0.898978 0.385226 -0.005997
866 0 65.69 13.94 16.02 4.10 14 19 21 31326 0.013528 0.101190 0.885281 0.433487 -0.000601
868 0 68.44 14.50 16.66 4.11 12 18 21 18371 0.010949 0.079423 0.909628 0.438282 -0.005570
870 0 70.24 14.84 17.03 4.13 13 20 19 15384 0.008680 0.073779 0.917541 0.443674 -0.004892
872 0 69.87 15.12 17.13 4.08 14 19 22 23359 0.011550 0.077670 0.910780 0.462138 -0.004629
874 0 69.98 15.07 17.05 4.10 12 26 24 5759 0.009580 0.075818 0.914602 0.460665 0.000059
878 0 63.52 12.84 15.76 4.03 27 41 18 17251 0.019949 0.079616 0.900435 0.361699 -0.004511
879 0 68.38 14.30 16.28 4.20 25 53 34 8067 0.015775 0.081354 0.902871 0.427792 -0.003822
881 0 80.02 17.41 19.77 4.05 28 45 30 15554 0.000974 0.103408 0.895618 0.426759 -0.005783
894 0 64.10 14.09 16.24 3.95 23 32 17 12164 0.008230 0.115984 0.875785 0.408528 -0.005740
897 0 56.35 11.20 12.88 4.38 15 10 4 12478 0.015719 0.093549 0.890732 0.049587 -0.004804
902 0 57.97 11.90 13.85 4.19 31 38 32 11962 0.007213 0.080013 0.912773 0.399001 -0.006421
903 0 84.74 17.61 21.04 4.03 28 63 32 3833 0.008932 0.176357 0.814711 0.357600 -0.002973
906 0 99.52 20.60 24.56 4.05 14 9 9 45029 0.070019 0.012519 0.917462 0.248194 -0.003369
908 0 65.26 13.98 16.27 4.01 26 31 18 33519 0.002062 0.084654 0.913284 0.379555 -0.003820

200 rows × 14 columns
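
For reference, scikit-learn's MinMaxScaler performs the same (x - min) / (max - min) rescaling as the cell above; a minimal sketch (not part of the original run) that should reproduce the Whole_data values:

# equivalent min-max scaling via scikit-learn (sketch)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled = DataFrame(scaler.fit_transform(whole_data),
                   columns=whole_data.columns, index=whole_data.index)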


In [7]:
Whole_data.describe()


Out[7]:
Label Avg_sen_len Blank Avg_num_word Avg_word_len num_of_weapon num_of_bloody num_of_mysterious num_of_PopWord Exclamation_Ratio Ques_Mark_Ratio Period_Ratio Stopwords_Ratio Avg_sentiment
count 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000
mean 0.500000 0.243924 0.281451 0.287134 0.461263 0.448043 0.444483 0.467407 0.175039 0.129606 0.539917 0.514478 0.834986 0.635879
std 0.501255 0.155021 0.160655 0.178725 0.173719 0.172415 0.163965 0.176389 0.134405 0.158138 0.160715 0.151734 0.130953 0.116756
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.136393 0.169251 0.158892 0.368421 0.342391 0.333333 0.365741 0.106233 0.031299 0.422285 0.420415 0.808510 0.574520
50% 0.500000 0.212631 0.244880 0.250729 0.463158 0.434783 0.448276 0.444444 0.143844 0.068185 0.541049 0.531708 0.845703 0.638899
75% 1.000000 0.308421 0.359713 0.368513 0.557895 0.565217 0.540230 0.574074 0.201665 0.167277 0.641179 0.618006 0.889067 0.701004
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

In [8]:
corrmat = Whole_data.corr()
corrmat


Out[8]:
Label Avg_sen_len Blank Avg_num_word Avg_word_len num_of_weapon num_of_bloody num_of_mysterious num_of_PopWord Exclamation_Ratio Ques_Mark_Ratio Period_Ratio Stopwords_Ratio Avg_sentiment
Label 1.000000 -0.011724 -0.008069 0.017057 -0.165229 0.144099 0.080820 0.081043 0.025593 0.088009 -0.134481 0.064837 -0.020507 -0.048014
Avg_sen_len -0.011724 1.000000 0.984967 0.979242 0.251266 -0.031755 0.166153 0.265984 0.130755 0.247384 -0.320978 0.134113 -0.019227 0.142893
Blank -0.008069 0.984967 1.000000 0.985378 0.152948 -0.052497 0.128518 0.231573 0.115210 0.201513 -0.314627 0.154117 0.048923 0.150920
Avg_num_word 0.017057 0.979242 0.985378 1.000000 0.055390 -0.053303 0.137360 0.229812 0.116027 0.248042 -0.281382 0.100331 0.007334 0.155410
Avg_word_len -0.165229 0.251266 0.152948 0.055390 1.000000 0.115597 0.187885 0.255186 0.115421 0.083783 -0.216982 0.136805 -0.123713 0.002357
num_of_weapon 0.144099 -0.031755 -0.052497 -0.053303 0.115597 1.000000 0.674252 0.537257 0.193223 -0.031040 0.099642 -0.066934 0.190961 0.026211
num_of_bloody 0.080820 0.166153 0.128518 0.137360 0.187885 0.674252 1.000000 0.797364 0.236442 0.110291 0.054063 -0.106615 0.287134 0.076268
num_of_mysterious 0.081043 0.265984 0.231573 0.229812 0.255186 0.537257 0.797364 1.000000 0.223923 0.082695 -0.090107 0.030324 0.331356 0.129078
num_of_PopWord 0.025593 0.130755 0.115210 0.116027 0.115421 0.193223 0.236442 0.223923 1.000000 0.065132 -0.089374 0.039416 0.072630 0.531727
Exclamation_Ratio 0.088009 0.247384 0.201513 0.248042 0.083783 -0.031040 0.110291 0.082695 0.065132 1.000000 -0.019385 -0.536594 -0.199896 0.088052
Ques_Mark_Ratio -0.134481 -0.320978 -0.314627 -0.281382 -0.216982 0.099642 0.054063 -0.090107 -0.089374 -0.019385 1.000000 -0.833280 0.073163 -0.043373
Period_Ratio 0.064837 0.134113 0.154117 0.100331 0.136805 -0.066934 -0.106615 0.030324 0.039416 -0.536594 -0.833280 1.000000 0.048785 -0.012082
Stopwords_Ratio -0.020507 -0.019227 0.048923 0.007334 -0.123713 0.190961 0.287134 0.331356 0.072630 -0.199896 0.073163 0.048785 1.000000 0.037057
Avg_sentiment -0.048014 0.142893 0.150920 0.155410 0.002357 0.026211 0.076268 0.129078 0.531727 0.088052 -0.043373 -0.012082 0.037057 1.000000

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt
# set up the matplotlib figure
f, ax=plt.subplots(figsize=(12,9))
# draw the heatmap using seaborn
sns.heatmap(corrmat,vmax=.8,square=True)
plt.show()



In [10]:
c = Whole_data.corr().abs()
s = c.unstack()
so = s.sort_values(kind='quicksort') # Series.order is deprecated; sort_values is the replacement
so[so > 0.9]


Out[10]:
Avg_sen_len        Avg_num_word         0.979242
Avg_num_word       Avg_sen_len          0.979242
Avg_sen_len        Blank                0.984967
Blank              Avg_sen_len          0.984967
Avg_num_word       Blank                0.985378
Blank              Avg_num_word         0.985378
Label              Label                1.000000
Period_Ratio       Period_Ratio         1.000000
Ques_Mark_Ratio    Ques_Mark_Ratio      1.000000
Exclamation_Ratio  Exclamation_Ratio    1.000000
num_of_PopWord     num_of_PopWord       1.000000
num_of_mysterious  num_of_mysterious    1.000000
num_of_bloody      num_of_bloody        1.000000
num_of_weapon      num_of_weapon        1.000000
Avg_word_len       Avg_word_len         1.000000
Avg_num_word       Avg_num_word         1.000000
Blank              Blank                1.000000
Avg_sen_len        Avg_sen_len          1.000000
Stopwords_Ratio    Stopwords_Ratio      1.000000
Avg_sentiment      Avg_sentiment        1.000000
dtype: float64
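
The unstacked matrix lists every pair twice and includes the trivial self-correlations. A small sketch (not part of the original run) that keeps each off-diagonal pair once by masking the lower triangle of c:

# keep one copy of each off-diagonal pair (sketch)
import numpy as np
upper = c.where(np.triu(np.ones(c.shape), k=1).astype(bool))
pairs = upper.unstack().dropna().sort_values(ascending=False)
print(pairs[pairs > 0.9])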

Separate the independent and dependent variables for classification


In [11]:
data_y = Whole_data['Label']
# drop the label plus Avg_sen_len and Blank, which correlate >0.97 with Avg_num_word
data_X = Whole_data.loc[:,Whole_data.columns.difference(['Label','Avg_sen_len','Blank'])] # .loc replaces the deprecated .ix
print data_y.shape
print data_X.shape
print data_y.dtype
print data_X.dtypes


(200L,)
(200, 11)
float64
Avg_num_word         float64
Avg_sentiment        float64
Avg_word_len         float64
Exclamation_Ratio    float64
Period_Ratio         float64
Ques_Mark_Ratio      float64
Stopwords_Ratio      float64
num_of_PopWord       float64
num_of_bloody        float64
num_of_mysterious    float64
num_of_weapon        float64
dtype: object

In [12]:
data_X.describe()


Out[12]:
Avg_num_word Avg_sentiment Avg_word_len Exclamation_Ratio Period_Ratio Ques_Mark_Ratio Stopwords_Ratio num_of_PopWord num_of_bloody num_of_mysterious num_of_weapon
count 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000
mean 0.287134 0.635879 0.461263 0.129606 0.514478 0.539917 0.834986 0.175039 0.444483 0.467407 0.448043
std 0.178725 0.116756 0.173719 0.158138 0.151734 0.160715 0.130953 0.134405 0.163965 0.176389 0.172415
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.158892 0.574520 0.368421 0.031299 0.420415 0.422285 0.808510 0.106233 0.333333 0.365741 0.342391
50% 0.250729 0.638899 0.463158 0.068185 0.531708 0.541049 0.845703 0.143844 0.448276 0.444444 0.434783
75% 0.368513 0.701004 0.557895 0.167277 0.618006 0.641179 0.889067 0.201665 0.540230 0.574074 0.565217
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

In [13]:
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [14]:
#prepare models (default hyperparameters; MLPClassifier's default max_iter may be too low to converge here, see the warning below)
X = data_X
Y = data_y
seed=5
models = []
models.append(('LR', LogisticRegression()))
models.append(('RF', RandomForestClassifier()))
models.append(('GB',GradientBoostingClassifier()))
models.append(('NN', MLPClassifier()))

In [15]:
#evaluate the models with 10-fold CV; note the KFold below is unshuffled, so
#random_state has no effect and each fold is a block of the label-ordered rows
#(a stratified, shuffled alternative is sketched after the output)
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)


LR: 0.280000 (0.112250)
RF: 0.605000 (0.078899)
GB: 0.595000 (0.090692)
C:\Users\Skyeo\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\neural_network\multilayer_perceptron.py:563: ConvergenceWarning: Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.
  % (), ConvergenceWarning)
NN: 0.210000 (0.181384)
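
Since the rows are ordered by label (the first hundred 1, the second hundred 0), the unshuffled folds above are single-class blocks, which is the likely reason LR and the MLP score below chance. A sketch of the same comparison with stratified, shuffled folds (not part of the original run; it would change the numbers above):

# stratified, shuffled CV keeps the class balance in every fold (sketch)
skf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for name, model in models:
    scores = model_selection.cross_val_score(model, X, Y, cv=skf, scoring='accuracy')
    print("%s: %f (%f)" % (name, scores.mean(), scores.std()))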

In [16]:
fig = plt.figure()
fig.suptitle('Model Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()



In [17]:
#split training and test datasets
from sklearn.model_selection import train_test_split # sklearn.cross_validation is deprecated in favor of model_selection
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size = 0.3, random_state = 0)
print("X_train : " + str(X_train.shape))
print("X_test : " + str(X_test.shape))
print("y_train : " + str(y_train.shape))
print("y_test : " + str(y_test.shape))


X_train : (140, 11)
X_test : (60, 11)
y_train : (140L,)
y_test : (60L,)

In [21]:
#test the impact of forest size on prediction accuracy
trees = range(1, 26)            # the n_estimators values actually fitted
accuracy = np.zeros(len(trees))
for idx, n_trees in enumerate(trees):
    classifier=RandomForestClassifier(n_estimators=n_trees)
    classifier=classifier.fit(X_train,y_train)
    predictions=classifier.predict(X_test)
    accuracy[idx]=accuracy_score(y_test,predictions)

In [22]:
#plot the effect of tree size
plt.cla()
plt.plot(trees,accuracy)
plt.show()
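
The curve above is noisy because it rests on a single 30% holdout. One lower-variance alternative for the same sweep is the forest's out-of-bag estimate; a sketch (not part of the original run; assumes the default bootstrap sampling):

# out-of-bag accuracy across forest sizes (sketch); each tree is scored on
# the rows its bootstrap sample never drew. Very small forests can leave
# rows with no OOB vote, hence the sweep starts at 5 trees.
sizes = range(5, 26)
oob = np.zeros(len(sizes))
for idx, n in enumerate(sizes):
    forest = RandomForestClassifier(n_estimators=n, oob_score=True)
    forest.fit(X_train, y_train)
    oob[idx] = forest.oob_score_
plt.plot(sizes, oob)
plt.show()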



In [50]:
#fit the random forest algorithm
print 'Random Forest'
rf = RandomForestClassifier(n_estimators=17,min_samples_leaf=1)
print 'Fitting model'
rf_fit=rf.fit(X_train,y_train)
print 'Predicting on test set'
y_pred_rf =rf.predict(X_test)
print confusion_matrix(y_test,y_pred_rf)
print accuracy_score(y_test,y_pred_rf)
print f1_score(y_test,y_pred_rf)


Random Forest
Fitting model
Predicting on test set
[[24  7]
 [11 18]]
0.7
0.666666666667

In [51]:
#fit the gradient boosting classifier
print 'Gradient Boosting'
gb = GradientBoostingClassifier(n_estimators=250, learning_rate=0.05, max_depth=10,
                                max_features=0.8, min_samples_leaf=4,
                                random_state=0, subsample=0.9)
print 'Fitting model'
gb.fit(X_train,y_train)
print 'Predicting on test set'
y_pred_gb = gb.predict(X_test)
print confusion_matrix(y_test,y_pred_gb)
print accuracy_score(y_test,y_pred_gb)
print f1_score(y_test,y_pred_gb)


Gradient Boosting
Fitting model
Predicting on test set
[[20 11]
 [ 5 24]]
0.733333333333
0.75

In [52]:
# NB: roc_curve is given hard 0/1 predictions here, so each "curve" is a
# single operating point joined to the corners (probability sketch below)
fpr_rf, tpr_rf, _ = metrics.roc_curve(y_test, y_pred_rf)
fpr_gb, tpr_gb, _ = metrics.roc_curve(y_test, y_pred_gb)
plt.plot(fpr_rf,tpr_rf)
plt.plot(fpr_gb,tpr_gb,color='orange')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.title('ROC curve')
plt.xlabel('False Positive Rate(1-Specificity)')
plt.ylabel('True Positive Rate(Sensitivity)')
plt.grid(True)
plt.plot((0.0,1.0),(0.0,1.0),color='grey',linewidth=1,linestyle='--')
plt.show()
print ('AUC for random forest is ' + str(auc(fpr_rf, tpr_rf)))
print ('AUC for Gradient Boosting is ' + str(auc(fpr_gb,tpr_gb)))


AUC for random forest is 0.69744160178
AUC for Gradient Boosting is 0.73637374861
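
Because the curves above are built from hard class labels, they describe only a single threshold per model; class probabilities trace the full curve. A sketch (not part of the original run; the values would differ from those above):

# ROC from predicted probabilities rather than hard labels (sketch);
# predict_proba(...)[:, 1] is the score for the positive class
proba_rf = rf.predict_proba(X_test)[:, 1]
proba_gb = gb.predict_proba(X_test)[:, 1]
fpr_p, tpr_p, _ = metrics.roc_curve(y_test, proba_rf)
print ('RF AUC from probabilities is ' + str(auc(fpr_p, tpr_p)))
fpr_p, tpr_p, _ = metrics.roc_curve(y_test, proba_gb)
print ('GB AUC from probabilities is ' + str(auc(fpr_p, tpr_p)))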

In [53]:
# get the feature importance 
names = data_X.columns.values
print "Features sorted by their score"
print sorted(zip(map(lambda x: round(x,4),rf.feature_importances_),names),reverse=True)


Features sorted by their score
[(0.1712, 'Avg_word_len'), (0.1175, 'num_of_weapon'), (0.0985, 'Avg_num_word'), (0.0926, 'num_of_mysterious'), (0.0923, 'num_of_PopWord'), (0.0821, 'Period_Ratio'), (0.0791, 'Ques_Mark_Ratio'), (0.0751, 'Exclamation_Ratio'), (0.0677, 'num_of_bloody'), (0.0645, 'Stopwords_Ratio'), (0.0594, 'Avg_sentiment')]

In [54]:
#gather plotting inputs: importances, their descending sort order, and the
#std of each importance across the forest's trees
importances=rf.feature_importances_
indices = np.argsort(importances)[::-1]
std = np.std([tree.feature_importances_ for tree in rf.estimators_],axis=0)
print importances
print indices
print std


[ 0.09854367  0.05937063  0.1711861   0.07506371  0.08212499  0.07908636
  0.06450674  0.09231495  0.06771732  0.09260681  0.11747871]
[ 2 10  0  9  7  4  5  3  8  6  1]
[ 0.06185107  0.04254249  0.07439091  0.03337515  0.07368179  0.05610773
  0.05238061  0.06657811  0.0539612   0.04544956  0.08297826]

In [55]:
# Plot the feature importances of random forest
import matplotlib.pyplot as plt
plt.figure()
plt.title("Feature importances")
plt.bar(range(data_X.shape[1]), importances[indices],
       color="r", yerr=std[indices],align="center")
plt.xticks(range(data_X.shape[1]), names[indices], rotation=90) # label bars with feature names rather than column indices
plt.xlim([-1, data_X.shape[1]])
plt.show()



In [56]:
# NB: log_loss on hard 0/1 predictions clips them to ~1e-15, so each of the
# 18 errors costs about -log(1e-15) = 34.5 and the average saturates
log_loss(y_test,y_pred_rf,normalize=True)


Out[56]:
10.36172620484006
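
Log loss is normally computed from predicted probabilities; a sketch (not part of the original run):

# log loss from the forest's class probabilities (sketch)
log_loss(y_test, rf.predict_proba(X_test), normalize=True)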

In [57]:
def score(sss,yyy):
    # element-wise comparison: 1 where the prediction matches the label, 0 otherwise
    i=0
    compare = list()
    while i< len(sss):
        if sss[i] == yyy[i]:
            compare.append(1)
        else:
            compare.append(0)
        i+=1
    return compare

In [58]:
ytest=Whole_data['Label'].values # NB: the full 200-row label column in BOOK ID order, not the shuffled test split

In [59]:
c=score(y_pred_rf,ytest) # compares the 60 test predictions against the first 60 unshuffled labels, so the pairs are misaligned

In [60]:
float(sum(c))/float(len(c))


Out[60]:
0.4166666666666667
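
The 0.4167 above is an artifact of that misalignment; comparing against the labels of the same test split recovers the 0.7 accuracy reported earlier. A sketch:

# align the manual check with the actual test split (sketch)
c_aligned = score(y_pred_rf, y_test.values)
print(float(sum(c_aligned))/float(len(c_aligned)))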

In [61]:
print 'GradientBoosting'
gb = GradientBoostingClassifier()
print 'Fitting model'
gb.fit(X_train,y_train)
print 'Predicting on test set'
y_pred_gb = gb.predict(X_test)


GradientBoosting
Fitting model
Predicting on test set

In [63]:
from sklearn.metrics import precision_score,recall_score, confusion_matrix, classification_report,accuracy_score, f1_score
print 'Accuracy:', accuracy_score(y_test, y_pred_rf)
print 'F1 score:', f1_score(y_test, y_pred_rf)
print 'Recall:', recall_score(y_test, y_pred_rf)
print 'Precision:', precision_score(y_test, y_pred_rf)
print '\n classification report:\n', classification_report(y_test,y_pred_rf)
print '\n confusion matrix:\n',confusion_matrix(y_test, y_pred_rf)


Accuracy: 0.7
F1 score: 0.666666666667
Recall: 0.620689655172
Precision: 0.72

 classification report:
             precision    recall  f1-score   support

        0.0       0.69      0.77      0.73        31
        1.0       0.72      0.62      0.67        29

avg / total       0.70      0.70      0.70        60


 confusion matrix:
[[24  7]
 [11 18]]