In [1]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.cross_validation import KFold
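# Note: sklearn.cross_validation is deprecated and was removed in scikit-learn 0.20; this notebook assumes Python 2 and an older scikit-learn release.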
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.metrics.pairwise import cosine_similarity
from sumy import evaluation
from sumy.models import dom
from sumy.nlp import tokenizers
from stemming.porter2 import stem
from os import listdir
import os.path
from nltk.corpus import stopwords
import nltk
import copy
import pickle
import unicodedata
import re
import numpy as np
import operator
import matplotlib.pyplot as plt
from collections import defaultdict
import nltk.data
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
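# Assumes the required NLTK data has been downloaded, e.g. nltk.download('punkt') for the sentence/word tokenizers,
# plus the 'stopwords' corpus and a POS tagger model such as 'averaged_perceptron_tagger'.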
%matplotlib inline
Feature Extraction:
<img src=".\Others\Features.png" alt="HTML5 Icon" width="800" height="500", style="display: ;">
In [2]:
data_root_dir = '../data/DUC2001'
annotation_file = 'annotations.txt'
txt_opn_tag = '<TEXT>'
txt_close_tag = '</TEXT>'
In [3]:
def get_cluster_and_its_files(data_root_dir,annotation_file):
'''Get a cluster and the file names associated with it.
Each line of the annotation file is expected to look like file_name@cluster_name;... (anything after the first ';' is ignored).
Returns a dictionary of the form { cluster_1 : [file1,file2,file3,...], cluster_2 : [file1,file2,file3,...] }'''
f = open(data_root_dir + '/' + annotation_file,'r')
clust_files = defaultdict(list)
for line in f.readlines():
cur_line = line.split(';')[0]
clust_name = cur_line.split('@')[1]
file_name = cur_line.split('@')[0]
clust_files[clust_name].append(file_name)
f.close()
return clust_files
In [4]:
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
print clust_files['mad cow disease']
clust_list = clust_files.keys()
In [5]:
def get_text_from_doc(document_path,txt_opn_tag,txt_close_tag):
f = open(document_path,'r')
content = f.read()
f.close()
start = content.index(txt_opn_tag) + len(txt_opn_tag)
end = content.index(txt_close_tag)
return content[start:end]
In [6]:
def tokenize_txt(text,nltk_flag=True,ner_flag=False):
text = text.strip()
if ner_flag == True:
tokenizedList = re.split('[^a-zA-Z]+', text.lower())
return tokenizedList
if nltk_flag == False:
#return [x.lower() for x in re.findall(r"\w+", text)]
tokenizedList = re.split('\W+', text.lower())
return [unicode(x,'utf-8') for x in tokenizedList if x != '' and x != '\n' and x != u'\x85' and x != '\r' and x != '_']
else:
return nltk.word_tokenize(unicode(text,'utf-8'))
#return [x for x in toks if x != '' and x != '\n' and x != u'\x85' and x != '\r' and x != '_' and x!= ',' and x != '.']
In [7]:
tokenize_txt('What is this ?? Is this _ cool ? I don\'t know',nltk_flag=True,ner_flag=True)
Out[7]:
Feature 1 : Term frequency over the cluster (TF)
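In the cell below, the TF of a word $w$ is aggregated over every document of the cluster $c$: $TF_c(w) = \sum_{d \in c} count(w, d)$.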
In [8]:
def get_term_freqs(data_root_dir,annotation_file,stop_words=None) :
'''Get the term freqs of words in clusters. The term freqs are unique to clusters.
Returns a dict of form {clust1 : {word1 : 2, word2 :3...},clust2 : {word1 : 2, word2 :3..} ......}'''
#Check about stop_words
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
clust_term_freq = defaultdict(defaultdict)
for clust,files in clust_files.iteritems():
term_freq = defaultdict(int)
for doc in files:
doc_path = data_root_dir + '/' + doc
txt = get_text_from_doc(doc_path,txt_opn_tag,txt_close_tag)
doc_tokens = tokenize_txt(txt)
for token in doc_tokens:
term_freq[token] += 1
clust_term_freq[clust] = term_freq
return clust_term_freq
In [9]:
clust_word_tfs = get_term_freqs(data_root_dir,annotation_file)
print clust_word_tfs['cattle disease']
Feature 2 : The total number of documents in the dataset, divided by the number of documents that contain this word (IDF)
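Concretely, for a word $w$ in a collection of $N$ documents, the cell below computes the raw ratio $IDF(w) = \frac{N}{df(w)}$, where $df(w)$ is the number of documents that contain $w$; note that no logarithm is applied.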
In [10]:
def get_doc_freqs(data_root_dir,annotation_file):
'''Return a dictionary of the form {word1 : df1 , word2 : df2 ...}'''
'''Example : {furazabol : 154.5 , the : 1.00032}'''
data_root_dir += '/'
docs = [file_name for _,__,file_name in os.walk(data_root_dir)][0]
if annotation_file in docs:
docs.remove(annotation_file)
inverted_index = defaultdict(set)
for doc in docs:
doc_path = data_root_dir + doc
txt = get_text_from_doc(doc_path,txt_opn_tag,txt_close_tag)
doc_tokens = tokenize_txt(txt)
for token in doc_tokens:
inverted_index[token].add(doc)
no_of_docs = len(docs)
idf_dict = defaultdict(float)
for term,doc_lst in inverted_index.iteritems():
idf_dict[term] = float(no_of_docs) / len(doc_lst)
return idf_dict
In [11]:
doc_freqs = get_doc_freqs(data_root_dir,annotation_file)
print doc_freqs['furazabol']
print doc_freqs['the']
Feature 3 : The number of documents in the current cluster that contain this word (CF)
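In symbols, for a cluster $c$: $CF_c(w) = |\{ d \in c : w \in d \}|$, i.e. a per-cluster document frequency computed from a cluster-level inverted index in the cell below.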
In [12]:
def get_clusterwise_dfs(data_root_dir,annotation_file):
'''Return a dictionary of the form : {clust1 : (word1 : df1,word2 :df2 .....) , clust1 : (word3 : df3,word2 :df3 .....)}'''
'''Note that the document frequencies of term are calculated clusterwise, and not on the whole dataset'''
clust_doc_freqs = defaultdict(defaultdict)
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
for clust,files in clust_files.iteritems():
inverted_index = defaultdict(set)
for doc in files:
doc_path = data_root_dir + '/' + doc
txt = get_text_from_doc(doc_path,txt_opn_tag,txt_close_tag)
doc_tokens = tokenize_txt(txt)
for token in doc_tokens:
inverted_index[token].add(doc)
clust_df = defaultdict(int)
for term,doc_lst in inverted_index.iteritems():
clust_df[term] = len(doc_lst)
clust_doc_freqs[clust] = clust_df
return clust_doc_freqs
In [13]:
clust_dfs = get_clusterwise_dfs(data_root_dir,annotation_file)
print sorted(clust_dfs['mad cow disease'].items(),key=operator.itemgetter(1),reverse=True)[0:20]
Feature 4 : A 4-dimensional binary vector indicating whether the word is a noun, a verb, an adjective or an adverb. If the word has another part of speech, the vector is all-zero (POS)
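For example, a word tagged only as a noun maps to [True, False, False, False], while a word whose tag set includes 'OT' (other) maps to the all-False vector; see vectorize_pos and its demo further below.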
In [14]:
def get_short_tag(long_tag,valid_pos=['NN','VB','JJ','RB']):
'''Truncate long_tag to get its first 2 chars. If a valid POS, return first 2 chars. else return OT (Other)'''
'''Valid POS are NN,VB,JJ,RB'''
valid_pos_lst = valid_pos
long_tag = str.upper(long_tag[0:2])
if long_tag in valid_pos_lst:
return long_tag
else:
return 'OT'
In [15]:
def get_sentence_tags(sentence):
'''POS tag the words in the sentence and return a dict of the form : {word1 : [tag1,tag2..], word2 : [tag3,tag4..]..}'''
word_tag_dict = defaultdict(set)
#sent_tags = pos_tagger.tag(tokenize_txt(sentence))
sent_tags = nltk.pos_tag(tokenize_txt(sentence))
for word_tag in sent_tags:
word = word_tag[0]
tag = word_tag[1]
word_tag_dict[word].add(get_short_tag(tag))
return word_tag_dict
In [16]:
print get_sentence_tags("sent one")
print get_sentence_tags("sent two")
In [17]:
def get_doc_tags(document):
'''Perform POS tagging on all the sentences in the document and return a dict of the form :'''
''' (sent_id : { word1 : tag1 ...}...}'''
sent_and_tags = defaultdict(int)
#sentences = document.split('.')
sentences = sent_detector.tokenize(document,realign_boundaries=True)
for i,sentence in enumerate(sentences):
sent_and_tags[i] = get_sentence_tags(sentence.strip('.').strip('\n'))
return sent_and_tags
In [18]:
get_doc_tags("Who is Alan Turing ??. Alan was born in the United Kingdom")
Out[18]:
In [19]:
def get_cluster_tags(data_root_dir,annotation_file):
'''Perform Part of Speech tagging across all the sentences in all the documents in all the clusters'''
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
clust_tags = defaultdict(defaultdict)
i = 1
for clust,files in clust_files.iteritems():
for doc in files:
if i %10 == 0:
print 'Finished tagging doc :', i
i += 1
doc_path = data_root_dir + '/' + doc
txt = get_text_from_doc(doc_path,txt_opn_tag,txt_close_tag)
clust_tags[clust][doc] = get_doc_tags(txt)
return clust_tags
In [22]:
clust_tags = get_cluster_tags(data_root_dir,annotation_file)
In [20]:
def serialize(file_name,data):
with open(file_name, 'wb') as f:
pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
In [21]:
def deserialize(file_name):
with open(file_name, 'rb') as f:
return pickle.load(f)
In [22]:
file_name = 'pos_tags.pickle'
#serialize(file_name,clust_tags)
clust_tags = deserialize(file_name)
print 'done'
In [23]:
old_cpy = copy.deepcopy(clust_tags)
In [24]:
def vectorize_pos(pos_set,pos_idx = {'NN' : 0 ,'VB' : 1,'JJ' : 2,'RB' : 3}):
'''Convert the POS set to a binary vector according to pos_idx'''
bin_pos_vec = 4*[False]
for pos in pos_set:
if pos == 'OT':
return 4*[False]
else:
bin_pos_vec[pos_idx[pos]] = True
return bin_pos_vec
In [25]:
print vectorize_pos({'NN','RB'})
print vectorize_pos({'NN','RB','JJ','VB','OT'})
In [26]:
def vectorize_tags_across_clusters(clust_tags):
'''Binarize the POS of words'''
for clust,docs in clust_tags.iteritems():
doc_sent = defaultdict(defaultdict)
for doc,sents in docs.iteritems():
sent_word = defaultdict(defaultdict)
for sen_id,word_pos in sents.iteritems():
for word,pos in word_pos.iteritems():
word_pos[word] = copy.deepcopy(vectorize_pos(pos))
sent_word[sen_id] = copy.deepcopy(word_pos)
doc_sent[doc] = copy.deepcopy(sent_word)
clust_tags[clust] = copy.deepcopy(doc_sent)
return clust_tags
In [27]:
new_clust_tags = vectorize_tags_across_clusters(clust_tags)
In [28]:
print old_cpy['mad cow disease']['LA060490-0083'][2],'\n\n'
print new_clust_tags['mad cow disease']['LA060490-0083'][2]
Feature 5 : A binary value equals one iff the output of the named entity classifier from CoreNLP is not empty (Named Entity)
In [29]:
def extract_ners(data_root_dir,annotation_file):
'''Perform Named Entity Recognition on all sentences in all docs in all clusters'''
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
clust_doc = defaultdict(defaultdict)
for clust,files in clust_files.iteritems():
doc_sent = defaultdict(defaultdict)
for file_name in files:
file_path = data_root_dir + '/' + file_name
doc = get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)
sentences = sent_detector.tokenize(doc)
sent_tokens =[tokenize_txt(sent,nltk_flag=True,ner_flag=True) for sent in sentences]
sent_ner_cnt = defaultdict(int)
for s_id,tok_sent in enumerate(sent_tokens):
ners = ner_tagger.tag(tok_sent)
cnt = 0
for ner in ners:
if ner[1] != 'O':
cnt += 1
sent_ner_cnt[s_id] = cnt
doc_sent[file_name] = copy.deepcopy(sent_ner_cnt)
print 'FINISHED NER ON ', file_name
clust_doc[clust] = copy.deepcopy(doc_sent)
return clust_doc
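Note that ner_tagger is not defined in this section; the NER counts are loaded from ner_tags.pickle below. For reference, a minimal sketch of one way such a tagger could be constructed with NLTK's Stanford NER wrapper (the model and jar paths here are placeholders, not taken from this notebook):
# Hypothetical construction of ner_tagger; both paths below are placeholders.
from nltk.tag import StanfordNERTagger
ner_tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar', encoding='utf-8')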
In [30]:
file_name = 'ner_tags.pickle'
#serialize(file_name,clust_ners)
clust_ners = deserialize(file_name)
print 'done'
In [31]:
clust_ners['mad cow disease']['LA060490-0083']
Out[31]:
Feature 6 : A binary value denoting whether the word is a number (Number)
In [32]:
def extract_digit_cnt(data_root_dir,annotation_file,cnt_ratio_flag='C'):
'''Count the number of digits in a sentence'''
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
clust_doc = defaultdict(defaultdict)
for clust,files in clust_files.iteritems():
doc_sent = defaultdict(defaultdict)
for file_name in files:
file_path = data_root_dir + '/' + file_name
doc = get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)
sentences = sent_detector.tokenize(doc)
sent_tokens =[tokenize_txt(sent) for sent in sentences]
sent_dig_cnt = defaultdict(int)
for s_id,tok_sent in enumerate(sent_tokens):
# reset the digit count for each sentence so that counts do not accumulate across sentences
dig_cnt = 0
for tok in tok_sent:
if tok.isdigit():
dig_cnt += 1
if cnt_ratio_flag == 'C':
sent_dig_cnt[s_id] = dig_cnt
else:
sent_dig_cnt[s_id] = float(dig_cnt)/len(tok_sent)
doc_sent[file_name] = copy.deepcopy(sent_dig_cnt)
clust_doc[clust] = copy.deepcopy(doc_sent)
return clust_doc
In [33]:
clust_digs = extract_digit_cnt(data_root_dir,annotation_file)
print 'done'
In [34]:
print clust_digs['mad cow disease']['LA060490-0083'][29]
Feature 22 : The number of digits, divided by the sentence length (Number ratio)
In [35]:
clust_dig_ratio = extract_digit_cnt(data_root_dir,annotation_file,'R')
print 'done'
clust_dig_ratio['mad cow disease']['LA060490-0083']
Out[35]:
Feature 23 : The number of stop words, divided by the sentence length (Stop word ratio)
In [36]:
def stop_word_ratio(data_root_dir,annotation_file):
'''Compute the stop word ratio for all sentences'''
'''stop word ratio == no of stop words in sent / len(sent) '''
english_stopwords = set(stopwords.words('english'))
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
clust_doc = defaultdict(defaultdict)
for clust,files in clust_files.iteritems():
doc_sent = defaultdict(defaultdict)
for file_name in files:
file_path = data_root_dir + '/' + file_name
doc = get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)
sentences = sent_detector.tokenize(doc)
sent_tokens =[tokenize_txt(sent) for sent in sentences]
sent_stop_ratio = defaultdict(int)
for s_id,tok_sent in enumerate(sent_tokens):
stop_cnt = 0
for tok in tok_sent:
if tok.lower() in english_stopwords:
stop_cnt += 1
sent_stop_ratio[s_id] = float(stop_cnt)/len(tok_sent)
doc_sent[file_name] = copy.deepcopy(sent_stop_ratio)
clust_doc[clust] = copy.deepcopy(doc_sent)
return clust_doc
In [37]:
clust_stop_word_ratio = stop_word_ratio(data_root_dir,annotation_file)
print 'done'
print clust_stop_word_ratio['mad cow disease']['LA060490-0083'][18]
Feature 24 : The number of words in the sentence (Sentence Length)
In [38]:
def sent_length(data_root_dir,annotation_file):
'''Compute the length of sentences and store them in a dictionary'''
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
clust_doc = defaultdict(defaultdict)
for clust,files in clust_files.iteritems():
doc_sent = defaultdict(defaultdict)
for file_name in files:
file_path = data_root_dir + '/' + file_name
doc = get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)
sentences = sent_detector.tokenize(doc)
sent_tokens =[tokenize_txt(sent) for sent in sentences]
sent_len = defaultdict(int)
for s_id,tok_sent in enumerate(sent_tokens):
sent_len[s_id] = len(tok_sent)
doc_sent[file_name] = copy.deepcopy(sent_len)
clust_doc[clust] = copy.deepcopy(doc_sent)
return clust_doc
In [39]:
clust_sent_lens = sent_length(data_root_dir,annotation_file)
print 'done'
print clust_sent_lens['mad cow disease']['LA060490-0083'][15]
In [40]:
file_path = data_root_dir + '/' + 'LA060490-0083'
doc = get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)
sentences = sent_detector.tokenize(doc)
print len(sentences[15].split(' '))
Feature 21 : The number of named entities divided by sentence length (NER Ratio)
In [41]:
def ner_ratio(data_root_dir,annotation_file,clust_ners,clust_sent_lens):
'''Compute the ratio of named entities to sentence length and store it in a dictionary'''
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
clust_doc = defaultdict(defaultdict)
for clust,files in clust_files.iteritems():
doc_sent = defaultdict(defaultdict)
for file_name in files:
file_path = data_root_dir + '/' + file_name
doc = get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)
total_sents = len(sent_detector.tokenize(doc))
#sent_tokens =[tokenize_txt(sent) for sent in sentences]
sent_ner_ratio = defaultdict(int)
for i in range(0,total_sents):
sent_ner_ratio[i] = float(clust_ners[clust][file_name][i])/clust_sent_lens[clust][file_name][i]
doc_sent[file_name] = copy.deepcopy(sent_ner_ratio)
clust_doc[clust] = copy.deepcopy(doc_sent)
return clust_doc
In [42]:
clust_ner_ratio = ner_ratio(data_root_dir,annotation_file,clust_ners,clust_sent_lens)
print 'done'
print clust_ner_ratio['mad cow disease']['LA060490-0083'][11]
Feature 20 : The number of nouns, verbs, adverbs and adjectives in the sentence, divided by the length of the sentence (POS Ratio)
In [43]:
def pos_ratio(data_root_dir,annotation_file,new_clust_tags,clust_sent_lens):
'''Compute the ratio of nouns, verbs, adverbs and adjectives to sentence length and store it in a dictionary'''
clust_doc = defaultdict(defaultdict)
clusters = clust_sent_lens.keys()
for clust in clusters:
doc_sent = defaultdict(defaultdict)
files = clust_sent_lens[clust].keys()
for _file in files:
sent_ids = clust_sent_lens[clust][_file].keys()
sent_pos_ratio = defaultdict(int)
for sent_id in sent_ids:
pos_cnt = 0
for word,tag_lst in new_clust_tags[clust][_file][sent_id].iteritems():
'''
if _file == 'LA060490-0083' and sent_id == 3:
print tag_lst, pos_cnt*1.0/clust_sent_lens[clust][_file][sent_id]
#print new_clust_tags[clust][_file][sent_id]
'''
if True in tag_lst:
pos_cnt += 1
sent_pos_ratio[sent_id] = float(pos_cnt)/ clust_sent_lens[clust][_file][sent_id]
doc_sent[_file] = copy.deepcopy(sent_pos_ratio)
clust_doc[clust] = copy.deepcopy(doc_sent)
return clust_doc
In [44]:
clust_pos_ratios = pos_ratio(data_root_dir,annotation_file,new_clust_tags,clust_sent_lens)
print 'done'
In [45]:
clust_pos_ratios['mad cow disease']['LA060490-0083'][3]
Out[45]:
Feature 14 : The position of the sentence. Suppose there are M sentences in the document; then for the $i$th sentence the position is computed as $1-\frac{i-1}{M-1}$ (POSITION)
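With the 0-based sent_id used in the implementation below, this becomes $1 - \frac{sent\_id}{M-1}$, so the first sentence of a document scores 1 and the last scores 0.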
In [46]:
def sentence_pos(data_root_dir,annotation_file,clust_sent_lens):
'''Compute the position of the sentence according to the formula above'''
clust_doc = defaultdict(defaultdict)
clusters = clust_sent_lens.keys()
for clust in clusters:
doc_sent = defaultdict(defaultdict)
files = clust_sent_lens[clust].keys()
for _file in files:
sent_ids = clust_sent_lens[clust][_file].keys()
total_sents = len(clust_sent_lens[clust][_file].keys())
#Avoid divide by 0 error
if total_sents == 1:
total_sents = 2
sent_position = defaultdict(int)
for sent_id in sent_ids:
sent_position[sent_id] = 1 - ( float( sent_id ) / (total_sents - 1) )
doc_sent[_file] = copy.deepcopy(sent_position)
clust_doc[clust] = copy.deepcopy(doc_sent)
return clust_doc
In [47]:
clust_sent_pos = sentence_pos(data_root_dir,annotation_file,clust_sent_lens)
print 'done'
In [48]:
clust_sent_pos['mad cow disease']['LA060490-0083']
Out[48]:
Feature 17 : The sum of the TF values of all words in the sentence, divided by the sentence length (Averaged TF)
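As implemented below, this is $\frac{1}{|s|}\sum_{w \in s} TF_c(w)$ for a sentence $s$ in cluster $c$; Features 18 and 19 follow the same pattern with IDF and CF respectively.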
In [49]:
def averaged_tf(data_root_dir,annotation_file,clust_word_tfs):
'''Get the average TF value of the words in each sentence and store them in a dictionary'''
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
clust_doc = defaultdict(defaultdict)
for clust,files in clust_files.iteritems():
doc_sent = defaultdict(defaultdict)
for file_name in files:
file_path = data_root_dir + '/' + file_name
doc = get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)
sentences = sent_detector.tokenize(doc)
sent_tokens =[tokenize_txt(sent,nltk_flag=True,ner_flag=True) for sent in sentences]
sent_mean_tf = defaultdict(int)
for s_id,tok_sent in enumerate(sent_tokens):
mean_tf = 0
for word in tok_sent:
mean_tf += clust_word_tfs[clust][word]
mean_tf = float(mean_tf)/len(tok_sent)
sent_mean_tf[s_id] = mean_tf
doc_sent[file_name] = copy.deepcopy(sent_mean_tf)
clust_doc[clust] = copy.deepcopy(doc_sent)
return clust_doc
In [50]:
clust_mean_tfs = averaged_tf(data_root_dir,annotation_file,clust_word_tfs)
print 'done'
In [51]:
clust_mean_tfs['mad cow disease']['LA060490-0083']
Out[51]:
Feature 18 : The sum of the IDF values of all words in the sentence, divided by the sentence length (Averaged IDF)
In [52]:
def averaged_idf(data_root_dir,annotation_file,doc_freqs):
'''Get the average IDF value of the words in each sentence and store them in a dictionary'''
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
clust_doc = defaultdict(defaultdict)
for clust,files in clust_files.iteritems():
doc_sent = defaultdict(defaultdict)
for file_name in files:
file_path = data_root_dir + '/' + file_name
doc = get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)
sentences = sent_detector.tokenize(doc)
sent_tokens =[tokenize_txt(sent,nltk_flag=True,ner_flag=True) for sent in sentences]
sent_mean_idf = defaultdict(int)
for s_id,tok_sent in enumerate(sent_tokens):
mean_idf = 0
for word in tok_sent:
mean_idf += doc_freqs[word]
mean_idf = float(mean_idf)/len(tok_sent)
sent_mean_idf[s_id] = mean_idf
doc_sent[file_name] = copy.deepcopy(sent_mean_idf)
clust_doc[clust] = copy.deepcopy(doc_sent)
return clust_doc
In [53]:
clust_mean_idfs = averaged_idf(data_root_dir,annotation_file,doc_freqs)
print 'done'
In [54]:
clust_mean_idfs['mad cow disease']['LA060490-0083']
Out[54]:
Feature 19 : The sum of the CF values of all words in the sentence, divided by the sentence length (Averaged CF)
In [55]:
def averaged_cf(data_root_dir,annotation_file,clust_dfs):
'''Get the average cluster frequency (CF) of the words in each sentence and store them in a dictionary'''
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
clust_doc = defaultdict(defaultdict)
for clust,files in clust_files.iteritems():
doc_sent = defaultdict(defaultdict)
for file_name in files:
file_path = data_root_dir + '/' + file_name
doc = get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)
sentences = sent_detector.tokenize(doc)
sent_tokens =[tokenize_txt(sent,nltk_flag=True,ner_flag=True) for sent in sentences]
sent_mean_cf = defaultdict(int)
for s_id,tok_sent in enumerate(sent_tokens):
mean_cf = 0
for word in tok_sent:
mean_cf += clust_dfs[clust][word]
mean_cf = float(mean_cf)/len(tok_sent)
sent_mean_cf[s_id] = mean_cf
doc_sent[file_name] = copy.deepcopy(sent_mean_cf)
clust_doc[clust] = copy.deepcopy(doc_sent)
return clust_doc
In [56]:
clust_mean_cfs = averaged_cf(data_root_dir,annotation_file,clust_dfs)
print 'done'
In [57]:
clust_mean_cfs['mad cow disease']['LA060490-0083']
Out[57]:
In [58]:
def get_rouge_n_score(sent_1,sent_2,n=2,do_stem=True):
'''Compute the ROUGE-N score of the two sentences (normalized overlapping n-grams)'''
'''Sentences are converted to lower-case and words are stemmed'''
#lower
sent_1 = sent_1.lower()
sent_2 = sent_2.lower()
tokenizer = tokenizers.Tokenizer('english')
sent_1_toks = tokenizer.to_words(sent_1)
sent_2_toks = tokenizer.to_words(sent_2)
#stem the sentence
if do_stem == True:
sent_1 = ' '.join([stem(tok) for tok in sent_1_toks])
sent_2 = ' '.join([stem(tok) for tok in sent_2_toks])
sent_obj_1= dom.Sentence(sent_1,tokenizer)
sent_obj_2= dom.Sentence(sent_2,tokenizer)
return evaluation.rouge_n([sent_obj_1],[sent_obj_2],n)
In [59]:
print 'ROUGE with stemming: ' , get_rouge_n_score('This iS SentENce CooLing','This is Sentence cool',2,True)
print 'ROUGE without stemming: ' , get_rouge_n_score('This iS SentENce CooLing','This is Sentence cool',2,False)
In [60]:
def get_docs_without_summary(data_root_dir,annotation_file):
'''Return a dictionary of the form {clust1 : [doc1,doc2...],clust2 : [doc1,doc2...] ....}'''
'''The key is the cluster name; the value is a list of documents for which no summary exists'''
'''Certain documents in the DUC dataset do not have a summary; this function is used to weed them out.'''
clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)
files_with_summ = set( [fname.lower() for fname in listdir(data_root_dir+ '/' + 'Summaries' + '/')] )
clust_docs_wo_summ = defaultdict(list)
for clust,files in clust_files.iteritems():
for _file in files:
tmp = _file + '.txt'
if tmp.lower() not in files_with_summ:
clust_docs_wo_summ[clust].append(_file)
return clust_docs_wo_summ
In [61]:
docs_without_summ = get_docs_without_summary(data_root_dir,annotation_file)
print docs_without_summ
In [62]:
def extract_gold_summ_from_doc(document_path):
'''Extract the Gold summary of a document.'''
'''Gold summary is of the form <Abstract:> This is the summary <Introduction:>'''
start_tag = 'Abstract:'
close_tag = 'Introduction:'
f = open(document_path,'r')
content = f.read()
f.close()
start = content.index(start_tag) + len(start_tag)
end = content.index(close_tag)
return content[start:end].strip()
In [63]:
doc_path = data_root_dir+ '/' + 'Summaries' + '/' + 'ap880623-0135.txt'
summ = extract_gold_summ_from_doc(doc_path)
print summ
In [64]:
def convert_dict_to_feature_column(clust_files,docs_without_summ):
'''Flatten the nested {cluster : {doc : {sent_id : value}}} dictionary into a feature column. Clusters, documents and sentence ids are visited in sorted order so that rows align with the targets built by construct_Y'''
feature_col = []
clusters = sorted(clust_files.keys())
for clust in clusters:
files = sorted(clust_files[clust].keys())
for _file in files:
#Ignore the docs that do not have a summary.
if _file not in docs_without_summ[clust]:
sent_ids = sorted(clust_files[clust][_file].keys())
for sent_id in sent_ids:
feature_col.append(clust_files[clust][_file][sent_id])
return np.array(feature_col)
In [65]:
def construct_X_Matrix(clust_sent_pos,clust_sent_lens,clust_mean_tfs,clust_mean_idfs,clust_mean_cfs,clust_pos_ratios,
clust_ner_ratio,clust_dig_ratio,clust_stop_word_ratio):
'''Construct the X matrix by stacking the feature columns for all sentences, and return it'''
F_position = convert_dict_to_feature_column(clust_sent_pos,docs_without_summ)
F_length = convert_dict_to_feature_column(clust_sent_lens,docs_without_summ)
F_mean_tfs = convert_dict_to_feature_column(clust_mean_tfs,docs_without_summ)
F_mean_idfs = convert_dict_to_feature_column(clust_mean_idfs,docs_without_summ)
F_mean_cfs = convert_dict_to_feature_column(clust_mean_cfs,docs_without_summ)
F_pos_ratio = convert_dict_to_feature_column(clust_pos_ratios,docs_without_summ)
F_ner_ratio = convert_dict_to_feature_column(clust_ner_ratio,docs_without_summ)
F_dig_ratio = convert_dict_to_feature_column(clust_dig_ratio,docs_without_summ)
F_stop_word_ratio = convert_dict_to_feature_column(clust_stop_word_ratio,docs_without_summ)
stack = (F_position,F_length,F_mean_tfs,F_mean_idfs,F_mean_cfs,F_pos_ratio,F_ner_ratio,F_dig_ratio,F_stop_word_ratio)
return np.column_stack(stack)
In [66]:
def construct_Y(clust_files,docs_without_summ):
'''Construct the Y output value (ROUGE score) for every sentence in the document, computed against
the gold summary of the document, i.e. ROUGE(sentence, summary)'''
feature_col = []
clusters = sorted(clust_files.keys())
for clust in clusters:
files = sorted(clust_files[clust])
for _file in files:
#Ignore the docs that do not have a summary.
if _file not in docs_without_summ[clust]:
file_path = data_root_dir + '/' + _file
doc = get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)
sentences = sent_detector.tokenize(doc)
sum_file_path = data_root_dir+ '/' + 'Summaries' + '/' + _file.lower() + '.txt'
gold_summ = extract_gold_summ_from_doc(sum_file_path)
for sent in sentences:
try:
rouge_score = get_rouge_n_score(sent,gold_summ)
except:
rouge_score = 0
#To avoid divide by zero error
feature_col.append(rouge_score)
return np.array(feature_col)
Cross-validate and plot the mean absolute error between the predicted and actual ROUGE scores.
The cost function is $J(\theta)=\frac{1}{m}\sum_{i=1}^m \left| pred_i - actual_i \right|,$
where
$pred_i$ is the predicted ROUGE score of $sentence_i$ and $actual_i$ is the true ROUGE score of $sentence_i$
In [67]:
def do_cross_validation(X_Matrix,Y,clf,n_folds,degree,Model='ridge'):
'''Perform n-fold cross validation.
Params:
X.........a Matrix of features
y.........the true ROUGE score of each sentence
n_folds...the number of folds of cross-validation to do
Return:
the average validation error (mean absolute error) across all folds.'''
if Model != 'deep':
poly = PolynomialFeatures(degree)
X_Matrix = poly.fit_transform(X_Matrix)
errors = []
cv = KFold(len(Y), n_folds)
for train_idx, test_idx in cv:
clf.fit(X_Matrix[train_idx], Y[train_idx])
predicted = clf.predict(X_Matrix[test_idx])
error = np.mean(np.abs(predicted - Y[test_idx]))
errors.append(error)
avg = np.mean(errors)
return avg
In [68]:
def plot_accuracies(X_Matrix,Y,clf,n_folds=10,poly_degrees=[1,2,3,4]):
'''Plot a graph of Test Error vs Polynomial Order'''
errors = [do_cross_validation(X_Matrix,Y,clf,n_folds,degree) for degree in poly_degrees]
plt.title('Ridge Regression')
plt.ylabel('Validation Error')
plt.xlabel('Polynomial Degree')
plt.plot(poly_degrees, errors,'r-')
plt.show()
In [69]:
def find_best_order(n_folds,Y,poly_degrees):
'''Experiment with various settings and figure out the best polynomial degree'''
X_Matrix = construct_X_Matrix(clust_sent_pos,clust_sent_lens,clust_mean_tfs,clust_mean_idfs,clust_mean_cfs,clust_pos_ratios,
clust_ner_ratio,clust_dig_ratio,clust_stop_word_ratio)
plot_accuracies(X_Matrix,Y,Ridge(),n_folds,poly_degrees)
In [70]:
Y = construct_Y(clust_files,docs_without_summ)
n_folds=10
poly_degrees=[1,2,3]
find_best_order(n_folds,Y,poly_degrees)
The validation error appears to be lowest when the polynomial order is 2, so raise X_Matrix to this order and fit the regressor.
In [118]:
def get_best_clf(best_order,Y,clf):
poly = PolynomialFeatures(best_order)
X_Matrix = construct_X_Matrix(clust_sent_pos,clust_sent_lens,clust_mean_tfs,clust_mean_idfs,clust_mean_cfs,clust_pos_ratios,
clust_ner_ratio,clust_dig_ratio,clust_stop_word_ratio)
X_Matrix = poly.fit_transform(X_Matrix)
print X_Matrix
clf.fit(X_Matrix,Y)
print '\nFitted Regressor with best settings'
return clf
In [119]:
clf = get_best_clf(2,Y,Ridge())
In [120]:
def construct_X_Matrix_for_test_doc(cluster,document,clust_sent_pos,clust_sent_lens,clust_mean_tfs,clust_mean_idfs,
clust_mean_cfs,clust_pos_ratios,clust_ner_ratio,clust_dig_ratio,
clust_stop_word_ratio,poly_order):
'''Extract all the features for a given document and return the extracted features'''
X_Matrix = []
# iterate sentence ids in sorted order so that rows line up with the document's sentence list
for sent_id in sorted(clust_sent_pos[cluster][document].keys()):
F_position = clust_sent_pos[cluster][document][sent_id]
F_length = clust_sent_lens[cluster][document][sent_id]
F_mean_tfs = clust_mean_tfs[cluster][document][sent_id]
F_mean_idfs = clust_mean_idfs[cluster][document][sent_id]
F_mean_cfs = clust_mean_cfs[cluster][document][sent_id]
F_pos_ratio = clust_pos_ratios[cluster][document][sent_id]
F_ner_ratio = clust_ner_ratio[cluster][document][sent_id]
F_dig_ratio = clust_dig_ratio[cluster][document][sent_id]
F_stop_word_ratio = clust_stop_word_ratio[cluster][document][sent_id]
row = [F_position,F_length,F_mean_tfs,F_mean_idfs,F_mean_cfs,
F_pos_ratio,F_ner_ratio,F_dig_ratio,F_stop_word_ratio]
X_Matrix.append(row)
poly = PolynomialFeatures(poly_order)
X_Matrix = poly.fit_transform(np.array(X_Matrix))
return X_Matrix
Greedy-based sentence selection:
- are_sentences_salient()
- select_sentences()
In [121]:
def are_sentences_salient(clust,sent_1,sent_2,threshold=0.6):
'''Check if the sentences are salient based on a threshold.
If COSINE_SIM(sent_1,sent_2) < threshold, return True, else False'''
sent_1_toks = tokenize_txt(sent_1)
sent_2_toks = tokenize_txt(sent_2)
vocab = list(set(sent_1_toks) | set(sent_2_toks))
vec_1 = []
vec_2 = []
for token in vocab:
tf = clust_word_tfs[clust][token]
idf = doc_freqs[token]
tf_idf = tf*idf
if token in sent_1_toks and token in sent_2_toks:
vec_1.append(tf_idf)
vec_2.append(tf_idf)
elif token in sent_1_toks and token not in sent_2_toks:
vec_1.append(tf_idf)
vec_2.append(0.0)
elif token not in sent_1_toks and token in sent_2_toks:
vec_1.append(0.0)
vec_2.append(tf_idf)
vec_1 = np.array(vec_1).reshape(1,-1)
vec_2 = np.array(vec_2).reshape(1,-1)
sim_score = list(cosine_similarity(vec_1,vec_2)[0])[0]
if sim_score < threshold:
return True
else:
return False
In [122]:
s1 = 'A commission chaired by Professor Sir Richard Southwood '
s2 = 'The government has committed $19 million to finding the cause of the disease Germany'
print are_sentences_salient('mad cow disease',s1 ,s2,threshold=0.6)
s1 = '$19 million disease Germany '
s2 = 'The government has committed $19 million to finding the cause of the disease Germany'
print are_sentences_salient('mad cow disease',s1 ,s2,threshold=0.6)
In [123]:
def doc_to_sent_list(document):
'''Convert a document to a list of sentences'''
file_path = data_root_dir + '/' + document
doc = get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)
sentences = sent_detector.tokenize(doc)
return sentences
In [124]:
def select_sentences(cluster,document,y_hats,sents_in_summ):
'''In each step of the greedy selection, pick a sentence if it satisfies 2 conditions:
1. It has the next highest predicted ROUGE score
2. It is salient, i.e. not too similar to the previously selected sentence of the summary'''
top_scores = sorted(y_hats,reverse=True)
prev_sent = ''
sent_id = 0
j = 0
all_sentences = doc_to_sent_list(document)
summary = ''
while(sent_id < sents_in_summ and j < len(top_scores)):
top_sent_idx = y_hats.index(top_scores[j])
cur_sent = all_sentences[top_sent_idx]
if are_sentences_salient(cluster,prev_sent,cur_sent,threshold=0.6):
summary += cur_sent + ' '
prev_sent = cur_sent
sent_id += 1
j += 1
return summary
In [128]:
def generate_summary(cluster,document,clf,order,i,sents_in_summ=2):
'''Generate the summary for a document with sents_in_summ number of sentences in it'''
X_Matrix = construct_X_Matrix_for_test_doc(cluster,document,clust_sent_pos,clust_sent_lens,clust_mean_tfs,
clust_mean_idfs,clust_mean_cfs,clust_pos_ratios,clust_ner_ratio,
clust_dig_ratio,clust_stop_word_ratio,order)
y_hats = list(clf.predict(X_Matrix))
print 'Generated SUMMARY for doc ',i, '::\n-----------------------------------'
summary = select_sentences(cluster,document,y_hats,sents_in_summ)
print str.replace(str.replace(summary,'<P>',''),'</P>','').strip()
print '\n'
print 'Actual SUMMARY for doc ',i, '::\n-----------------------------------'
summary_path = data_root_dir+ '/' + 'Summaries' + '/' + document.lower() + '.txt'
print extract_gold_summ_from_doc(summary_path)
print '\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
'''
print 'COMPLETE TEXT :: \n---------------------'
complete_text = ''.join([sent for sent in sentences])
print str.replace(str.replace(complete_text,'<P>',''),'</P>','').strip()
'''
In [129]:
def random_summaries(prob,sents=1):
'''Generate summaries for approximately (1-prob) * total_docs documents (total_docs is roughly 300).
prob should be in the range 0 <= prob <= 1.0 ; 0 generates summaries for all documents, 1 for none.'''
i = 1
for clust,docs in clust_files.items():
for doc in docs:
if np.random.uniform(low=0.0, high=1.0) > prob and doc not in docs_without_summ[clust] :
generate_summary(clust,doc,clf,2,i,sents)
i += 1
print 'Generation Complete'
In [130]:
random_summaries(0.98,1)
In [131]:
def serialize_data_matrix(poly_deg=2):
'''Helper function to dump the data matrix onto hard disc'''
poly = PolynomialFeatures(poly_deg)
X_Matrix = construct_X_Matrix(clust_sent_pos,clust_sent_lens,clust_mean_tfs,clust_mean_idfs,clust_mean_cfs,clust_pos_ratios,
clust_ner_ratio,clust_dig_ratio,clust_stop_word_ratio)
X_Matrix = poly.fit_transform(X_Matrix)
data = np.column_stack((X_Matrix,Y))
serialize('data_matrix',data)
print 'done'
In [123]:
serialize_data_matrix(2)
A hard baseline that takes the very first sentence of the document as the summary
In [138]:
def get_predicted_rouge(cluster,document,clf,order,sents_in_summ=1):
X_Matrix = construct_X_Matrix_for_test_doc(cluster,document,clust_sent_pos,clust_sent_lens,clust_mean_tfs,
clust_mean_idfs,clust_mean_cfs,clust_pos_ratios,clust_ner_ratio,
clust_dig_ratio,clust_stop_word_ratio,order)
y_hats = list(clf.predict(X_Matrix))
pred_summary = select_sentences(cluster,document,y_hats,sents_in_summ)
summary_path = data_root_dir+ '/' + 'Summaries' + '/' + document.lower() + '.txt'
gold_summary = extract_gold_summ_from_doc(summary_path)
try:
rogue = get_rouge_n_score(gold_summary,pred_summary,n=2,do_stem=True)
except:
rogue = 0
return rogue
In [139]:
def evaluate_custom_model(clf,order=2,sents_in_summ=1):
'''Evaluate the Model'''
rouge_lst = []
for clust,docs in clust_files.items():
for doc in docs:
if doc not in docs_without_summ[clust] :
rouge = get_predicted_rouge(clust,doc,clf,order,sents_in_summ)
rouge_lst.append(rouge)
avg = sum(rouge_lst)/len(rouge_lst)
return avg
In [140]:
def evaluate_hard_baseline_2():
'''Blindly take the first sentence as the predicted summary. Compute the ROUGE score between
the first sentence and the actual summary.'''
rouge_lst = []
for clust,docs in clust_files.items():
for doc in docs:
if doc not in docs_without_summ[clust] :
first_sentence = doc_to_sent_list(doc)[0]
doc_path = data_root_dir+ '/' + 'Summaries' + '/' + doc.lower() + '.txt'
summary = extract_gold_summ_from_doc(doc_path)
try:
rogue = get_rouge_n_score(first_sentence,summary,n=2,do_stem=True)
except:
rogue = 0
rouge_lst.append(rogue)
avg = sum(rouge_lst)/len(rouge_lst)
return avg
In [141]:
hard_baseline_accuracy = evaluate_hard_baseline_2()
model_accuracy = evaluate_custom_model(clf,order=2,sents_in_summ=1)
print 'First Sentence Model\'s accuracy',hard_baseline_accuracy
print 'Our Model\'s accuracy', model_accuracy
In [142]:
from sklearn.neural_network import MLPRegressor
In [143]:
def get_Xmatrix_and_y(file_name):
summ_data = deserialize(file_name)
X_mat = summ_data[:,0:len(summ_data[0])-1]
# flatten y to a 1-D array to avoid shape/broadcasting issues when computing errors downstream
y = summ_data[:,-1]
return X_mat,y
In [144]:
def train_regressor(optimizer,hidden_layer_units,activation_func,epochs):
'''Build an MLPRegressor with the specified settings and return it (it is fitted later, e.g. during cross-validation)'''
'''Optimizer : any of lbfgs, sgd, adam
hidden_layer_units : a tuple of the form (x,y,z) where x is the no of units in the 1st hidden layer, y in the 2nd, and so on
activation_func : logistic / tanh / relu
epochs : no of training epochs (max_iter)
'''
regr = MLPRegressor(solver=optimizer,hidden_layer_sizes=hidden_layer_units,activation=activation_func,max_iter=epochs)
return regr
In [155]:
def run_mlp(optimizers,activation_funcs,epochs,hid_layer_sizes,X_mat,y):
'''Run the MLP for settings specified in the parameters'''
optimizer_act = defaultdict(defaultdict)
for optimizer in optimizers:
act_epoch = defaultdict(defaultdict)
for act_func in activation_funcs:
epoch_hl = defaultdict(defaultdict)
for epoch in epochs:
hl_err = defaultdict(float)
for hid_layer_size in hid_layer_sizes:
regr = train_regressor(optimizer,hid_layer_size,act_func,epoch)
error = do_cross_validation(X_mat,y,regr,10,None,'deep')
hl_err[len(hid_layer_size)] = error
print optimizer,act_func,epoch,len(hid_layer_size),'h_layers complete. Error = ',error
epoch_hl[epoch] = copy.deepcopy(hl_err)
act_epoch[act_func] = copy.deepcopy(epoch_hl)
optimizer_act[optimizer] = copy.deepcopy(act_epoch)
return optimizer_act
In [156]:
def get_best_hyperparams():
'''Specify the various settings for hyperparameters here. Calls run_mlp to get the Validation error'''
X_mat,y = get_Xmatrix_and_y('data_matrix')
optimizers = ['sgd','adam','lbfgs']
activation_funcs = ['tanh','logistic']
epochs = [5,10,15,20]
single_hl = (57,)
double_hl = (57,57)
triple_hl = (57,57,57)
hid_layer_sizes = [single_hl,double_hl,triple_hl]
return run_mlp(optimizers,activation_funcs,epochs,hid_layer_sizes,X_mat,y)
In [157]:
optimizer_act = get_best_hyperparams()
In [158]:
#serialize('MLP_Val_Err.pickle',optimizer_act)
optimizer_act = deserialize('MLP_Val_Err.pickle')
In [159]:
def plot_epoch_vs_errors(optimizer_act):
'''Plot Epoch vs Validation Errors by varying the following :
1. No of Hidden Layers
2. No of Epochs
3. Varying the optimizer
4. Varying the activation function for each optimizer
'''
for optimizer in sorted(optimizer_act.keys()):
for act_func in sorted(optimizer_act[optimizer].keys()):
layer_error = defaultdict(list)
for epoch in sorted(optimizer_act[optimizer][act_func].keys()):
for h_layer in sorted(optimizer_act[optimizer][act_func][epoch].keys()):
layer_error[h_layer].append(optimizer_act[optimizer][act_func][epoch][h_layer])
x = sorted(optimizer_act[optimizer][act_func].keys())
y_lst = []
for lyr in sorted(optimizer_act[optimizer][act_func][epoch].keys()):
y_lst.append(layer_error[lyr])
plt.xlabel('Epochs')
plt.ylabel('Validation Error')
plt.plot(x,y_lst[0],'r-')
plt.plot(x,y_lst[1],'g-')
plt.plot(x,y_lst[2],'b-')
plt.legend(['1 Hidden Layer', '2 Hidden Layers', '3 Hidden Layers'], loc='upper right',columnspacing=0.0, labelspacing=0.0,
)
title = optimizer.upper() + ' with ' + act_func.upper() + ' activation'
plt.title(title)
plt.show()
In [161]:
plot_epoch_vs_errors(optimizer_act)
Best settings per optimizer, as read off the plots above (these configurations are trained as the final models below):

Optimizer | Activation | Hidden Layers |
---|---|---|
Adam | Logistic | 3 |
LBFGS | Logistic | 3 |
SGD | Logistic | 3 |
In [173]:
def get_model_accuracy():
'''Get the Deep Models Accuracies'''
model_acc = defaultdict(float)
data = 'data_matrix'
#ADAM, Logistic 3
name = 'adam'
model = train_regressor(name,(57,57,57),'logistic',20)
X,y = get_Xmatrix_and_y(data)
model.fit(X,y)
model_acc[name] = evaluate_custom_model(model,order=2,sents_in_summ=1)
#LBFGS, Logistic 3
name = 'lbfgs'
model = train_regressor(name,(57,57,57),'logistic',20)
X,y = get_Xmatrix_and_y(data)
model.fit(X,y)
model_acc[name] = evaluate_custom_model(model,order=2,sents_in_summ=1)
#SGD, Logistic 3
name = 'sgd'
model = train_regressor(name,(57,57,57),'logistic',20)
X,y = get_Xmatrix_and_y(data)
model.fit(X,y)
model_acc[name] = evaluate_custom_model(model,order=2,sents_in_summ=1)
return model_acc
In [6]:
model_acc = get_model_accuracy()
model_acc['first_sent'] = evaluate_hard_baseline_2()
model_acc['ridge'] = evaluate_custom_model(clf,order=2,sents_in_summ=1)
model_acc
Out[6]:
In [9]:
objects = tuple([x.upper() for x in model_acc.keys()])
y_pos = np.arange(len(objects))
performance = model_acc.values()
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Test Accuracy')
plt.title('Test Accuracies of Different Models')
plt.show()