In [5]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
In [6]:
# Location of the pickled feature table. The default is the path this notebook
# was originally run with; set the SKIMR_DATASET_PATH environment variable to
# load the file from somewhere else without editing this cell.
dataset_path = os.environ.get(
    'SKIMR_DATASET_PATH',
    '/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/dataset_submit_len_lda_wpos_wpos_readmets',
)
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
dataset_submit = pd.read_pickle(dataset_path)
print(dataset_submit)
In [7]:
# Split the table once by the 'highlightornot' label (1 -> h*/hl* variables,
# 0 -> f*/ft* variables) instead of rebuilding the same boolean mask for every
# column, then pull each feature column out of both halves.
highlight_flag = dataset_submit['highlightornot']
highlighted_rows = dataset_submit.loc[highlight_flag == 1]
non_highlighted_rows = dataset_submit.loc[highlight_flag == 0]

# Sentence length, topic similarity and position features.
hllens, ftlens = highlighted_rows['length'], non_highlighted_rows['length']
h_ldadists, f_ldadists = highlighted_rows['LDAdist'], non_highlighted_rows['LDAdist']
h_sposes, f_sposes = highlighted_rows['sentPos'], non_highlighted_rows['sentPos']
h_wposes, f_wposes = highlighted_rows['wordPos'], non_highlighted_rows['wordPos']

# Readability metric columns.
h_ARI, f_ARI = highlighted_rows['ARI'], non_highlighted_rows['ARI']
h_FRE, f_FRE = highlighted_rows['FRE'], non_highlighted_rows['FRE']
h_FKG, f_FKG = highlighted_rows['FKG'], non_highlighted_rows['FKG']
h_SMG, f_SMG = highlighted_rows['SMG'], non_highlighted_rows['SMG']
h_CLI, f_CLI = highlighted_rows['CLI'], non_highlighted_rows['CLI']
h_GFI, f_GFI = highlighted_rows['GFI'], non_highlighted_rows['GFI']
h_LIX, f_LIX = highlighted_rows['LIX'], non_highlighted_rows['LIX']
h_RIX, f_RIX = highlighted_rows['RIX'], non_highlighted_rows['RIX']

print(hllens)
In [9]:
def plot_feature_hist(highlighted, non_highlighted, title, xlabel, bins, value_range):
    """Overlay normalised histograms of one feature for both sentence classes.

    Parameters
    ----------
    highlighted, non_highlighted : array-like
        Feature values for the rows labelled 1 and 0 respectively.
    title, xlabel : str
        Figure title and x-axis label.
    bins : int
        Number of histogram bins.
    value_range : tuple
        (lower, upper) bounds passed to ``plt.hist`` as ``range``.
    """
    # `density=True` replaces the `normed=1` keyword, which newer Matplotlib
    # releases no longer accept.
    shared = dict(bins=bins, density=True, range=value_range, alpha=0.5)
    plt.hist(highlighted, label='highlighted', **shared)
    plt.hist(non_highlighted, label='non-highlighted', **shared)
    plt.title(title)
    plt.legend(loc="upper right")
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.show()


# One row per figure:
# (highlighted values, non-highlighted values, title, x-axis label, bins, range)
# NOTE(review): the wordPos figure reuses the title "Sentence position"; only
# its x-axis label tells it apart from the sentPos figure -- confirm intended.
feature_histograms = [
    (hllens, ftlens, "Sentence length", "Number of words", 50, (-10, 140)),
    (h_ldadists, f_ldadists, "Topic similarity", "Topic similarity score", 50, (0, 1)),
    (h_sposes, f_sposes, "Sentence position", "Fraction sentences into text", 25, (0, 1)),
    (h_wposes, f_wposes, "Sentence position", "Fraction words into text", 25, (0, 1)),
    (h_ARI, f_ARI, "Readability (ARI)", "ARI score", 25, (0, 50)),
    (h_FRE, f_FRE, "Readability (FRE)", "FRE score", 25, (0, 50)),
    (h_FKG, f_FKG, "Readability (FKG)", "FKG score", 25, (-200, 200)),
    (h_GFI, f_GFI, "Readability (GFI)", "GFI score", 25, (0, 50)),
    (h_SMG, f_SMG, "Readability (SMG)", "SMG score", 25, (0, 30)),
    (h_CLI, f_CLI, "Readability (CLI)", "CLI score", 25, (-50, 50)),
    (h_LIX, f_LIX, "Readability (LIX)", "LIX score", 25, (0, 100)),
    (h_RIX, f_RIX, "Readability (RIX)", "RIX score", 25, (0, 50)),
]

for histogram_spec in feature_histograms:
    plot_feature_hist(*histogram_spec)
In [21]:
# Feature names and their model weights; entry i of feature_weights belongs to
# entry i of feature_names.
# NOTE(review): presumably logistic-regression coefficients, like the SMOTE
# variant further down -- confirm.
feature_names = [
    'length', 'LDAdist', 'wordPos', 'sentPos',
    'ARI', 'FRE', 'FKG', 'GFI',
    'SMG', 'CLI', 'LIX', 'RIX',
]
feature_weights = [
    -0.75450588,  # length
    0.00378554,   # LDAdist
    -0.14955534,  # wordPos
    0.17832368,   # sentPos
    -0.06008622,  # ARI
    0.69844673,   # FRE
    1.1656712,    # FKG
    0.01445301,   # GFI
    0.10138341,   # SMG
    0.12576411,   # CLI
    0.16656827,   # LIX
    -0.75734814,  # RIX
]

# Show the weights in ascending order (the stored list itself stays unsorted).
ascending_weights = list(feature_weights)
ascending_weights.sort()
print(ascending_weights)
In [26]:
# Bar chart of the feature weights, largest first.
# Sort (weight, name) pairs together so each bar keeps its own label instead of
# relying on a separately hand-typed label list staying in sync.
ranked_features = sorted(
    zip(feature_weights, feature_names),
    key=lambda pair: pair[0],
    reverse=True,
)
ranked_weights = [weight for weight, _ in ranked_features]
labels = [name for _, name in ranked_features]

ind = np.arange(len(ranked_weights))
fig, ax = plt.subplots()
rects1 = ax.bar(ind, ranked_weights)
ax.set_ylabel('Weight')
ax.set_title('Feature weights')
# Put exactly one tick under every bar before assigning labels; without this
# the labels are applied to Matplotlib's automatically chosen tick positions,
# which do not line up one-to-one with the 12 bars.
ax.set_xticks(ind)
ax.set_xticklabels(labels)
plt.show()
In [27]:
# Histograms of the highlighted sentences only: the non-highlighted overlay and
# the legend are switched off for these three figures.
# One row per figure: (values, title, x-axis label, bins, range)
highlighted_only_histograms = [
    (hllens, "Sentence length", "Number of words", 50, (-10, 140)),
    (h_ldadists, "Topic similarity", "Topic similarity score", 50, (0, 1)),
    (h_sposes, "Sentence position", "Fraction sentences into text", 25, (0, 1)),
]

for values, title, xlabel, bins, value_range in highlighted_only_histograms:
    # `density=True` replaces the `normed=1` keyword, which newer Matplotlib
    # releases no longer accept.
    plt.hist(values, bins=bins, density=True, alpha=0.5, range=value_range, label='highlighted')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.show()

# Unused experiment kept for reference: rescale each bar by the total.
# NOTE(review): `p` and `x` are not defined in the visible cells.
# for item in p:
# item.set_height(item.get_height()/sum(x))
In [ ]:
In [ ]:
In [3]:
# Feature weights for logistic regression + SMOTE; entry i of
# feature_weights_smote belongs to entry i of feature_names_smote.
feature_names_smote = [
    'length', 'LDAdist', 'wordPos', 'sentPos',
    'ARI', 'FRE', 'FKG', 'GFI',
    'SMG', 'CLI', 'LIX', 'RIX',
]
feature_weights_smote = [
    -0.53389911,  # length
    0.0017571,    # LDAdist
    -0.06211337,  # wordPos
    0.0919522,    # sentPos
    -0.07619093,  # ARI
    0.59428718,   # FRE
    0.9357417,    # FKG
    -0.0548295,   # GFI
    0.1296457,    # SMG
    0.0961243,    # CLI
    0.17552932,   # LIX
    -0.59777636,  # RIX
]

# Show the weights in ascending order (the stored list itself stays unsorted).
ascending_weights_smote = list(feature_weights_smote)
ascending_weights_smote.sort()
print(ascending_weights_smote)
In [4]:
# Bar chart of the SMOTE feature weights, largest first.
# Sort (weight, name) pairs together so each bar keeps its own label instead of
# relying on a separately hand-typed label list staying in sync.
ranked_features_smote = sorted(
    zip(feature_weights_smote, feature_names_smote),
    key=lambda pair: pair[0],
    reverse=True,
)
ranked_weights_smote = [weight for weight, _ in ranked_features_smote]
labels = [name for _, name in ranked_features_smote]

ind = np.arange(len(ranked_weights_smote))
fig, ax = plt.subplots()
rects1 = ax.bar(ind, ranked_weights_smote)
ax.set_ylabel('Weight')
ax.set_title('Feature weights, SMOTE')
# Put exactly one tick under every bar before assigning labels; without this
# the labels are applied to Matplotlib's automatically chosen tick positions,
# which do not line up one-to-one with the 12 bars.
ax.set_xticks(ind)
ax.set_xticklabels(labels)
plt.show()
In [ ]: