Train a bag-of-words classifier (Multinomial Naive Bayes), as used for spam/ham filters, as a first pass at distinguishing between highlighted and non-highlighted sentences.
This is unlikely to be very accurate, since highlighted sentences seem to be more similar in writing style to the sentences around them than spam e-mail text is to normal e-mail text.
In [2]:
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob, Word
import pandas as pd
import sklearn
import pickle
import numpy as np
import nltk.data
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import learning_curve, GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
# Load the English Punkt sentence tokenizer from the NLTK data files; it is
# used below to split highlights and article text into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
In [3]:
# Load the three pickled objects from the skimr datasets directory.
# Each file is opened in a ``with`` block so its handle is closed as soon as
# the object has been read (the bare ``pickle.load(open(...))`` form leaves
# the handles open until garbage collection).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/dict_all', 'rb') as fh:
    dict_all = pickle.load(fh)
# `data` is indexed below by the keys 'ids', 'highlights' and 'text'.
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/data_pd', 'rb') as fh:
    data = pickle.load(fh)
# `sent_all` is compared below against the freshly tokenized sentence list.
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/sent_all', 'rb') as fh:
    sent_all = pickle.load(fh)
In [4]:
# Sanity check: number of entries in the loaded sent_all object.
print(len(sent_all))
In [6]:
# Split every highlight into sentences and collect them in two forms:
#   high_all - flat list of all highlight sentences
#   high_mat - one [id, sentence, 'hl'] row per highlight sentence
high_all = []
high_mat = []
for row, doc_id in enumerate(data['ids']):
    # Highlights are looked up by the running row number, not by the id.
    hl_sentences = tokenizer.tokenize(data['highlights'][row])
    high_all.extend(hl_sentences)
    high_mat.extend([doc_id, sentence, 'hl'] for sentence in hl_sentences)

print(len(high_all))
print(len(high_mat))
print(high_mat[0])
In [7]:
# Build the non-highlighted sentences: join each text into one string, blank
# out the highlight, then sentence-tokenize what remains.
#   sent_tmp - flat list of all remaining sentences
#   sent_mat - one [id, sentence, 'no'] row per remaining sentence
sent_tmp = []
sent_mat = []
for row, doc_id in enumerate(data['ids']):
    full_text = str(' '.join(data['text'][row]))
    # The highlight is removed as one exact substring, replaced by a space.
    without_highlight = full_text.replace(data['highlights'][row], ' ')
    plain_sentences = tokenizer.tokenize(without_highlight)
    sent_tmp.extend(plain_sentences)
    sent_mat.extend([doc_id, sentence, 'no'] for sentence in plain_sentences)

print(len(sent_tmp))
print(len(sent_mat))
print(sent_mat[0])
# Displayed as the cell output: does the rebuilt list equal the pickled one?
sent_tmp == sent_all
Out[7]:
In [8]:
# Stack highlight rows on top of non-highlight rows and turn the
# [id, sentence, tag] triples into a three-column DataFrame.
allbag = high_mat + sent_mat
column_names = ('ids', 'sentence', 'tag')
d_allbag = pd.DataFrame(
    {name: [row[pos] for row in allbag] for pos, name in enumerate(column_names)}
)
# NOTE: describe() reports non-unique sentences within tags. Are there really
# ~40 sentences shared between highlights across all articles? Look into this.
d_allbag.groupby('tag').describe()
Out[8]:
In [32]:
# Collect the first occurrence of every sentence among the leading rows of
# d_allbag, preserving order, and remember which ids were duplicates.
#
# Fix: the row limit used to be tested only after a *unique* row had been
# handled, so if the row that reached the limit was a duplicate the check was
# skipped and the loop ran on over the rest of the frame. The limit is now
# tested on every iteration.
# NOTE(review): 5210 is a hard-coded cutoff, presumably the number of
# highlight rows at the top of d_allbag -- confirm (e.g. against len(high_mat)).
row_limit = 5210

seen = set()
seen_add = seen.add
text_unq = []
id_unq = []  # ids of first (unique) occurrences
id_non = []  # ids of repeated occurrences
for idnum, x in enumerate(d_allbag['sentence']):
    if idnum >= row_limit:
        break
    row_id = d_allbag['ids'][idnum]
    if x in seen:
        id_non.append(row_id)
        continue
    seen_add(x)
    text_unq.append(x)
    id_unq.append(row_id)

# Spot-check that ids and sentences stay aligned, then report the counts.
print(id_unq[49])
print(text_unq[49])
print(len(id_unq))
print(len(text_unq))
In [ ]:
# Add a 'length' column holding len() of each sentence, then preview the frame.
d_allbag['length'] = d_allbag['sentence'].map(len)
print(d_allbag.head())
In [10]:
# Histogram of the 'length' column with 50 bins, drawn separately for each tag.
plot = d_allbag.hist(column='length', by='tag', bins=50)#, xlim=[0, 800], ylim=[0,800])
# plt.set_xlim((0,800))
# plot[0][1].set_xlim((0,800))
plt.show()
In [11]:
def tokenize_words(txt):
    """Lower-case ``txt`` and return its words as split by TextBlob."""
    return TextBlob(txt.lower()).words
def split_into_lemmas(sent):
    """Return the lemma of every word in ``sent``.

    The sentence is lower-cased and split into words with TextBlob; each word
    is then replaced by its ``lemma`` attribute (its base form).
    """
    lowered = sent.lower()
    return [token.lemma for token in TextBlob(lowered).words]
# Preview the lemmatizer on the first few sentences ...
preview = d_allbag['sentence'].head()
print(preview.apply(split_into_lemmas))
# ... then lemmatize every sentence (shown as the cell output, not stored).
d_allbag['sentence'].apply(split_into_lemmas)
Out[11]:
In [12]:
# f_allbag = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/d_allbag','wb')
# pickle.dump(d_allbag, f_allbag)
In [13]:
print('test')
# Bag-of-words vectorizer that uses split_into_lemmas as its analyzer, with
# the vocabulary learned from every sentence in d_allbag.
bow_transformer = CountVectorizer(analyzer=split_into_lemmas)
bow_transformer.fit(d_allbag['sentence'])
vocabulary_size = len(bow_transformer.vocabulary_)
print(vocabulary_size)
In [22]:
# Inspect the bag-of-words vector of a single example sentence (row label 9).
test10 = d_allbag['sentence'][9]
print(test10)
bow10 = bow_transformer.transform([test10])
print(bow10)
print(bow10.shape)

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); use whichever this version provides so
# the cell runs on both old and new releases.
if hasattr(bow_transformer, 'get_feature_names_out'):
    feature_names = bow_transformer.get_feature_names_out()
else:
    feature_names = bow_transformer.get_feature_names()
# NOTE(review): 80731 is a hard-coded feature index read off the printed
# bow10 above; it is only valid for the vocabulary of that particular fit.
print(feature_names[80731])  # what appears twice?

# Vectorize every sentence and report the size of the resulting matrix.
sents_bow = bow_transformer.transform(d_allbag['sentence'])
print('sparse matrix shape:', sents_bow.shape)
print('number of non-zeros:', sents_bow.nnz)
# NOTE: despite the label, the value printed is the percentage of entries
# that are non-zero (nnz / total cells).
print('sparsity: %.2f%%' % (100.0 * sents_bow.nnz / (sents_bow.shape[0] * sents_bow.shape[1])))
In [25]:
# Learn IDF weights from the full bag-of-words matrix.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(sents_bow)

# TF-IDF vector of the example sentence.
tfidf10 = tfidf_transformer.transform(bow10)
print(tfidf10)

# IDF weight of a few sample vocabulary entries.
for word in ('u', 'university', 'you'):
    print(tfidf_transformer.idf_[bow_transformer.vocabulary_[word]])
In [27]:
# Apply the fitted TF-IDF weighting to the bag-of-words matrix of all
# sentences and report the shape of the result.
sents_tfidf = tfidf_transformer.transform(sents_bow)
print( sents_tfidf.shape)
In [44]:
# Fit a Multinomial Naive Bayes classifier on the TF-IDF features with the
# 'tag' column as labels; the IPython %time magic reports how long it takes.
%time highlighter = MultinomialNB().fit(sents_tfidf, d_allbag['tag'])
In [50]:
# Check the classifier on the single example sentence first.
print('predicted:', highlighter.predict(tfidf10)[0])
print('expected:', d_allbag.tag[9])

# NOTE: sents_tfidf is the same matrix the classifier was fitted on, so the
# scores below are training-set scores.
all_predictions = highlighter.predict(sents_tfidf)
print(all_predictions)

true_tags = d_allbag['tag']
print('accuracy', accuracy_score(true_tags, all_predictions))
conf_mat = confusion_matrix(true_tags, all_predictions)
print('confusion matrix\n', conf_mat)
print('(row=expected, col=predicted)')

# Draw the confusion matrix as a greyscale image.
plt.matshow(conf_mat, cmap=plt.cm.binary, interpolation='nearest')
plt.title('confusion matrix')
plt.colorbar()
plt.ylabel('expected label')
plt.xlabel('predicted label')
plt.show()
In [52]:
# Per-tag precision, recall and F1 for the predictions computed above.
print( classification_report(d_allbag['tag'], all_predictions))
In [35]:
# Randomly hold out 20% of the sentences (and their tags) as a test set.
sent_train, sent_test, tag_train, tag_test = train_test_split(
    d_allbag['sentence'], d_allbag['tag'], test_size=0.2
)
split_sizes = (len(sent_train), len(sent_test))
print(split_sizes[0], split_sizes[1], sum(split_sizes))
In [43]:
# Peek at the held-out tags and sentences.
print(tag_test.head())
print(sent_test.head())
# Fix: tag_test is a Series, so the former `tag_test.tag[0]` raised
# AttributeError. The aim here is the number of 'hl' and 'no' tags in each
# split, which value_counts() gives directly.
print(tag_train.value_counts())
tag_test.value_counts()
In [ ]: