Generate plots of features and feature weights



In [5]:

    
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd



In [6]:

    
# Load the pickled feature table that every later cell slices and plots.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made relative to a configurable data directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
DATASET_PATH = '/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/dataset_submit_len_lda_wpos_wpos_readmets'
dataset_submit = pd.read_pickle(DATASET_PATH)
print(dataset_submit)









    



              ARI      CLI      FKG       FRE      GFI   LDAdist        LIX  \
0        8.620000  11.3423   7.6338   63.4862  11.3538  0.978843  28.384615   
1       14.007500  18.5125   7.7733   60.7050   8.1333  0.981704  28.666667   
2        7.295294   7.7276   6.3106   80.0976   9.1529  0.185355  40.529412   
3       -0.087273   1.2845   1.5727  103.3791   4.4000  0.980338  11.000000   
4        4.410000   5.6171   4.1986   89.8964   8.4571  0.978843  21.142857   
5        0.340000   2.4633  -4.2133  141.3000   8.0444  0.088626  20.111111   
6        3.338182   5.5682   2.6455   95.6882   4.4000  0.978844  20.090909   
7       16.609767   4.1858  14.3521   68.7528  19.0605  0.155225  52.302326   
8       -2.660000  -4.2033  -6.5533  147.3900   1.2000  0.445979   3.000000   
9        4.678947   3.3911   7.3463   76.2342  11.8105  0.172752  24.263158   
10       8.397368   8.0411   9.8305   58.4237  11.8105  0.976764  34.789474   
11      -0.090000   1.7600   2.8800   83.3200  10.0000  0.212839   5.000000   
12       1.410000   4.0100   3.7550   82.3900   8.2000  0.981702  33.000000   
13     -16.220000 -39.9100 -15.2000  205.8200   0.4000  0.445979   1.000000   
14       9.234706  10.1529   9.0871   60.1918  13.8588  0.202789  46.411765   
15      11.859231   8.4185  11.3423   60.0527  16.5538  0.164257  52.923077   
16       0.270000   1.3333  -1.0767  124.1550   4.8000  0.194811  12.000000   
17       5.625000   5.1117   5.8522   85.1650   9.4222  0.174978  29.111111   
18      10.475200   7.2668   7.8480   83.3240  14.8000  0.179529  37.000000   
19       7.849412   8.4206   7.6988   70.1447  11.5059  0.169103  28.764706   
20       5.998750   6.6212   6.1375   79.5575  11.4000  0.895860  28.500000   
21      10.082500  13.6042   8.7567   53.6550  11.4667  0.169100  45.333333   
22       6.414211   5.5611   5.4832   89.5921   7.6000  0.977669  29.526316   
23      18.068125  21.7144  12.7750   31.9700  18.9000  0.139409  59.750000   
24      11.591875  13.6156  13.5125   26.6825  16.4000  0.174981  34.750000   
25      12.859474  13.6211  11.6937   45.0658  16.0211  0.785448  45.315789   
26     -16.220000 -39.9100 -15.2000  205.8200   0.4000  0.445979   1.000000   
27      20.512500  13.3075  18.1167   29.2950  23.2889  0.970737  72.111111   
28      -0.087273   1.2845   1.5727  103.3791   4.4000  0.978846  20.090909   
29      -3.858000  -2.9520   0.5200  100.2400   2.0000  0.445979   5.000000   
...           ...      ...      ...       ...      ...       ...        ...   
281301   6.649000  10.0610   6.0100   69.7850   8.0000  0.124723  30.000000   
281302  29.463158  11.6770  24.4435   21.3379  25.6070  0.982657  81.561404   
281303  25.331887   8.9721  22.0008   31.7268  24.2189  0.876314  75.641509   
281304  21.059412  15.1929  20.2288   10.5897  24.1882  0.885722  69.294118   
281305  20.819565   7.6200  17.4848   51.6363  21.0087  0.962248  59.043478   
281306  16.880800  15.2772  14.9280   32.5640  19.6000  0.977757  61.000000   
281307  14.833448  10.3814  12.4028   57.7931  15.7379  0.665232  49.689655   
281308  20.291538  11.2195  17.7738   37.0962  20.7282  0.169233  64.641026   
281309   9.900000  11.5033   9.1800   34.5900  14.5333  0.732380  36.333333   
281310  18.576400  17.3976  15.4000   29.1800  14.8000  0.976069  57.000000   
281311  33.621364  11.3214  27.3136   16.7905  30.0364  0.689548  85.696970   
281312   3.910000   5.4523   7.6338   63.4862  11.3538  0.195252  28.384615   
281313   7.652500  10.5675   6.6200   54.7250  11.6000  0.111985  54.000000   
281314   3.400714   4.3550   2.5129  101.9821   5.6000  0.208495  14.000000   
281315  26.580192  11.1475  23.0708   22.2742  25.4154  0.967257  75.076923   
281316   6.763636   9.8518   9.0818   49.5427  11.6727  0.302241  38.272727   
281317  14.195455   7.2079  14.4436   50.2855  19.2606  0.044852  45.121212   
281318   9.504231   5.4735   9.0731   76.3219  13.4769  0.188102  33.692308   
281319   1.765000   3.2050   6.6200   54.7250  11.6000  0.973335  29.000000   
281320  10.335000  11.0017  10.4411   52.2650  13.8667  0.550717  34.666667   
281321  18.721538   9.2562  16.8662   43.6038  20.7282  0.093440  56.948718   
281322  13.875556  10.3574  10.2363   69.7633  13.7630  0.034595  41.814815   
281323   8.397368   8.0411   6.7253   80.6868  11.8105  0.964383  34.789474   
281324  10.195000   7.4917  11.9617   52.0500  16.2667  0.947481  36.500000   
281325  10.417000  14.7730  11.9100   27.4850  16.0000  0.986691  40.000000   
281326  -2.122500  -0.4075  -0.6700  114.1150   3.2000  0.272988   8.000000   
281327  20.322439  10.0451  15.9415   53.7956  20.3024  0.885810  60.512195   
281328  21.912128   8.3749  19.5613   38.5300  23.9064  0.915502  64.021277   
281329   0.000000   0.0000   0.0000    0.0000   0.0000  0.449594   0.000000   
281330   0.340909   1.8200   1.5727  103.3791   4.4000  0.284824  11.000000   

         RIX        SMG  highlightornot  length   sentPos   wordPos  
0        2.0  10.745967               1      13  0.069811  0.073383  
1        2.0   8.477226               0      12  0.000000  0.000000  
2        4.0   8.477226               0      16  0.001887  0.001610  
3        0.0   3.000000               0      11  0.003774  0.003756  
4        1.0   8.477226               0      14  0.005660  0.005232  
5        1.0   8.477226               0       8  0.007547  0.007110  
6        1.0   3.000000               0      11  0.009434  0.008184  
7        4.0  10.745967               0      43  0.011321  0.009659  
8        0.0   3.000000               0       3  0.013208  0.015428  
9        1.0  10.745967               0      19  0.015094  0.015830  
10       3.0  10.745967               0      19  0.016981  0.018379  
11       0.0   8.477226               0       5  0.018868  0.020928  
12       2.0   8.477226               0       8  0.020755  0.021599  
13       0.0   3.000000               0       1  0.000000  0.000000  
14       5.0  12.486833               0      16  0.024528  0.022807  
15       7.0  13.954451               0      25  0.026415  0.024953  
16       0.0   3.000000               0      11  0.028302  0.028307  
17       2.0   8.477226               0      18  0.030189  0.029783  
18       3.0  12.486833               0      24  0.032075  0.032197  
19       2.0  10.745967               0      17  0.033962  0.035417  
20       2.0  10.745967               0      16  0.035849  0.037698  
21       4.0  10.745967               0      12  0.037736  0.039844  
22       2.0   3.000000               0      19  0.039623  0.041454  
23       7.0  15.247449               0      15  0.041509  0.044003  
24       3.0  13.954451               0      16  0.043396  0.046016  
25       5.0  13.954451               0      19  0.045283  0.048162  
26       0.0   3.000000               0       1  0.000000  0.000000  
27      13.0  18.491933               0      34  0.049057  0.050845  
28       1.0   3.000000               0      10  0.050943  0.055406  
29       0.0   3.000000               0       5  0.052830  0.056748  
...      ...        ...             ...     ...       ...       ...  
281301   2.0   8.477226               0      10  0.523810  0.505956  
281302  14.0  13.954451               0      57  0.539683  0.512226  
281303  12.0  13.954451               0      52  0.555556  0.547962  
281304  12.0  19.431677               0      34  0.571429  0.580564  
281305   6.0  12.486833               0      45  0.587302  0.601881  
281306   9.0  16.416408               0      25  0.603175  0.630094  
281307   6.0  12.486833               0      29  0.619048  0.645768  
281308  10.0  15.247449               0      39  0.634921  0.663950  
281309   1.0   8.477226               0       3  0.650794  0.688401  
281310   8.0  12.486833               0      25  0.666667  0.690282  
281311  13.0  16.416408               0      64  0.682540  0.705956  
281312   2.0  10.745967               0      13  0.698413  0.746082  
281313   2.0   8.477226               0       4  0.714286  0.754232  
281314   0.0   3.000000               0      13  0.730159  0.756740  
281315  12.0  16.416408               0      52  0.746032  0.764890  
281316   3.0  10.745967               0      11  0.761905  0.797492  
281317   4.0  15.247449               0      33  0.777778  0.804389  
281318   2.0  10.745967               0      26  0.793651  0.825078  
281319   1.0   8.477226               0       4  0.809524  0.841379  
281320   3.0  12.486833               0      18  0.825397  0.843887  
281321   7.0  15.247449               0      39  0.841270  0.855172  
281322   4.0  10.745967               0      27  0.857143  0.879624  
281323   3.0  10.745967               0      19  0.873016  0.896552  
281324   3.0  13.954451               0      24  0.888889  0.908464  
281325   3.0  12.486833               0      10  0.904762  0.923511  
281326   0.0   3.000000               0       8  0.920635  0.929781  
281327   8.0  13.954451               0      41  0.936508  0.934796  
281328   8.0  16.416408               0      47  0.952381  0.960502  
281329   0.0   0.000000               0       1  0.000000  0.000000  
281330   0.0   3.000000               0      11  0.984127  0.993103  

[281331 rows x 13 columns]



In [7]:

    
# Split the feature table by the 'highlightornot' label once, then pull each
# feature column out of the two halves. The h*/hl* names hold rows labelled 1
# (plotted later as 'highlighted'); the f*/ft* names hold rows labelled 0
# (plotted later as 'non-highlighted').
_label = dataset_submit['highlightornot']
_rows_1 = dataset_submit.loc[_label == 1]
_rows_0 = dataset_submit.loc[_label == 0]

# Sentence length, topic-distance and position features.
hllens, ftlens = _rows_1['length'], _rows_0['length']
h_ldadists, f_ldadists = _rows_1['LDAdist'], _rows_0['LDAdist']
h_sposes, f_sposes = _rows_1['sentPos'], _rows_0['sentPos']
h_wposes, f_wposes = _rows_1['wordPos'], _rows_0['wordPos']

# Readability-score features.
h_ARI, f_ARI = _rows_1['ARI'], _rows_0['ARI']
h_FRE, f_FRE = _rows_1['FRE'], _rows_0['FRE']
h_FKG, f_FKG = _rows_1['FKG'], _rows_0['FKG']
h_SMG, f_SMG = _rows_1['SMG'], _rows_0['SMG']
h_CLI, f_CLI = _rows_1['CLI'], _rows_0['CLI']
h_GFI, f_GFI = _rows_1['GFI'], _rows_0['GFI']
h_LIX, f_LIX = _rows_1['LIX'], _rows_0['LIX']
h_RIX, f_RIX = _rows_1['RIX'], _rows_0['RIX']

print(hllens)









    



0         13
531       23
576       12
639       20
690       28
748       25
749        5
750        5
751        9
752       10
753        6
754        9
823       23
870       14
871       15
872       30
909        9
980       46
981        1
1045      11
1046       5
1047      13
1048       5
1049       9
1050      28
1051      23
1123      17
1286      16
1287      19
1335       9
          ..
279545    39
279562     5
279563    34
279704    18
279705    23
279749     3
279750    27
279781     4
279815     9
279842     6
279889    19
279923    10
279924    13
280187    20
280220     1
280385     5
280386    22
280515    17
280569     1
280601     1
280698    12
280699     2
280893    30
280928    37
281007     1
281037    14
281129     9
281211     6
281245    15
281267     5
Name: length, Length: 5211, dtype: int64



In [9]:

    
def _plot_class_histograms(highlighted, other, bins, value_range, title, xlabel):
    """Overlay normalised histograms of one feature for the two classes.

    Parameters
    ----------
    highlighted, other : array-like
        Feature values for the rows labelled 1 and 0 respectively.
    bins : int
        Number of equal-width bins.
    value_range : tuple of (low, high)
        Histogram range; values outside it are ignored by ``plt.hist``.
    title, xlabel : str
        Figure title and x-axis label.
    """
    # `density=True` replaces the removed `normed=1` keyword; each histogram is
    # normalised separately so the two differently sized classes are comparable.
    shared = dict(bins=bins, density=True, range=value_range, alpha=0.5)
    plt.hist(highlighted, label='highlighted', **shared)
    plt.hist(other, label='non-highlighted', **shared)
    plt.title(title)
    plt.legend(loc="upper right")
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.show()


# One row per figure:
# (highlighted values, non-highlighted values, bins, range, title, x-axis label)
# NOTE(review): in the recorded DataFrame printout most FRE values are above 50
# while FKG values stay within roughly +/-30, so the FRE and FKG ranges below
# may have been swapped -- confirm before relying on those two figures.
_HISTOGRAM_SPECS = [
    (hllens, ftlens, 50, (-10, 140), "Sentence length", "Number of words"),
    (h_ldadists, f_ldadists, 50, (0, 1), "Topic similarity", "Topic similarity score"),
    (h_sposes, f_sposes, 25, (0, 1), "Sentence position", "Fraction sentences into text"),
    (h_wposes, f_wposes, 25, (0, 1), "Sentence position", "Fraction words into text"),
    (h_ARI, f_ARI, 25, (0, 50), "Readability (ARI)", "ARI score"),
    (h_FRE, f_FRE, 25, (0, 50), "Readability (FRE)", "FRE score"),
    (h_FKG, f_FKG, 25, (-200, 200), "Readability (FKG)", "FKG score"),
    (h_GFI, f_GFI, 25, (0, 50), "Readability (GFI)", "GFI score"),
    (h_SMG, f_SMG, 25, (0, 30), "Readability (SMG)", "SMG score"),
    (h_CLI, f_CLI, 25, (-50, 50), "Readability (CLI)", "CLI score"),
    (h_LIX, f_LIX, 25, (0, 100), "Readability (LIX)", "LIX score"),
    (h_RIX, f_RIX, 25, (0, 50), "Readability (RIX)", "RIX score"),
]

for _spec in _HISTOGRAM_SPECS:
    _plot_class_histograms(*_spec)



In [21]:

    
# Feature names and their weights as two parallel lists.
# Presumably feature_weights[i] is the weight of feature_names[i] -- confirm
# against the model output these numbers were copied from.
feature_names = [
    'length', 'LDAdist', 'wordPos', 'sentPos',
    'ARI', 'FRE', 'FKG', 'GFI',
    'SMG', 'CLI', 'LIX', 'RIX',
]
feature_weights = [
    -0.75450588, 0.00378554, -0.14955534, 0.17832368,
    -0.06008622, 0.69844673, 1.1656712, 0.01445301,
    0.10138341, 0.12576411, 0.16656827, -0.75734814,
]

# Show the weights in ascending order.
print(sorted(feature_weights))









    



[-0.75734814, -0.75450588, -0.14955534, -0.06008622, 0.00378554, 0.01445301, 0.10138341, 0.12576411, 0.16656827, 0.17832368, 0.69844673, 1.1656712]



In [26]:

    
# Bar chart of the feature weights, largest first.
ind = np.arange(len(feature_weights))  # one bar position per weight
fig, ax = plt.subplots()
rects1 = ax.bar(ind, sorted(feature_weights, reverse=True))

ax.set_ylabel('Weight')
ax.set_title('Feature weights')

# Hand-written feature names, listed in the same descending-weight order as
# the bars. This list must be updated by hand if `feature_weights` changes.
labels = ['FKG', 'FRE', 'sentPos', 'LIX', 'CLI', 'SMOG', 'GFI', 'LDAdist', 'ARI', 'wordPos', 'length', 'RIX']

# Pin exactly one tick under each bar before labelling. Without this the
# labels are attached to Matplotlib's automatically chosen tick positions,
# which do not line up one-to-one with the bars.
ax.set_xticks(ind)
ax.set_xticklabels(labels)

plt.show()



In [27]:

    
# Normalised histograms of the highlighted class only: no non-highlighted
# overlay and no legend, one figure per feature.
# `density=True` replaces the removed `normed=1` keyword.
for _values, _bins, _range, _title, _xlabel in [
    (hllens, 50, (-10, 140), "Sentence length", "Number of words"),
    (h_ldadists, 50, (0, 1), "Topic similarity", "Topic similarity score"),
    (h_sposes, 25, (0, 1), "Sentence position", "Fraction sentences into text"),
]:
    plt.hist(_values, bins=_bins, density=True, alpha=0.5, range=_range, label='highlighted')
    plt.title(_title)
    plt.xlabel(_xlabel)
    plt.ylabel("Frequency")
    plt.show()


# for item in p:
#     item.set_height(item.get_height()/sum(x))



In [ ]:



In [ ]:



In [3]:

    
# Feature weights for the logistic regression + SMOTE run, kept as two
# parallel lists. Presumably feature_weights_smote[i] is the weight of
# feature_names_smote[i] -- confirm against the model output.
feature_names_smote = [
    'length', 'LDAdist', 'wordPos', 'sentPos',
    'ARI', 'FRE', 'FKG', 'GFI',
    'SMG', 'CLI', 'LIX', 'RIX',
]
feature_weights_smote = [
    -0.53389911, 0.0017571, -0.06211337, 0.0919522,
    -0.07619093, 0.59428718, 0.9357417, -0.0548295,
    0.1296457, 0.0961243, 0.17552932, -0.59777636,
]

# Show the weights in ascending order.
print(sorted(feature_weights_smote))









    



[-0.59777636, -0.53389911, -0.07619093, -0.06211337, -0.0548295, 0.0017571, 0.0919522, 0.0961243, 0.1296457, 0.17552932, 0.59428718, 0.9357417]



In [4]:

    
# Bar chart of the SMOTE-run feature weights, largest first.
ind = np.arange(len(feature_weights_smote))  # one bar position per weight
fig, ax = plt.subplots()
rects1 = ax.bar(ind, sorted(feature_weights_smote, reverse=True))

ax.set_ylabel('Weight')
ax.set_title('Feature weights, SMOTE')

# Hand-written feature names, listed in the same descending-weight order as
# the bars. This list must be updated by hand if `feature_weights_smote` changes.
labels = ['FKG', 'FRE', 'LIX', 'SMG', 'CLI', 'sentPos', 'LDAdist', 'GFI', 'wordPos', 'ARI', 'length', 'RIX']

# Pin exactly one tick under each bar before labelling. Without this the
# labels are attached to Matplotlib's automatically chosen tick positions,
# which do not line up one-to-one with the bars.
ax.set_xticks(ind)
ax.set_xticklabels(labels)

plt.show()



In [ ]: