In [1]:
with open('testfile.txt', 'r') as f:
    testlist = f.readlines()
testlist = map(lambda x: x.strip(), testlist)  # strip trailing newlines
print testlist[:10]
In [2]:
'''
This turns each data file into one 5-tuple containing
filename, file text, query, answer, and answer candidates.
'''
def extract(datafile):
    filename = datafile[0].strip()
    filetext = datafile[2].strip()
    query = datafile[4].strip()
    answer = datafile[6].strip()
    answercand = [item.strip() for item in datafile[8:] if item != '\n']
    return filename, filetext, query, answer, answercand
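For reference, here is a minimal, hypothetical sample in the line layout extract assumes (lines 1, 3, 5, and 7 hold the filename, text, query, and answer, separated by blank lines; candidates start at line 9):
In [ ]:
# Hypothetical file contents, shaped like the output of f.readlines().
sample_lines = [
    "some_file.question\n", "\n",
    "the king visited the palace yesterday\n", "\n",
    "@placeholder visited the palace\n", "\n",
    "the king\n", "\n",
    "the king\n", "the queen\n", "a knight\n",
]
print extract(sample_lines)
# -> ('some_file.question', 'the king visited the palace yesterday',
#     '@placeholder visited the palace', 'the king',
#     ['the king', 'the queen', 'a knight'])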
In [3]:
'''
This fills the '@placeholder' token (the blank part) of a sentence with the given answer.
'''
def make_query(sentence_with_placeholder, answer):
    # str.replace never raises, so check for the token explicitly
    # instead of relying on try/except.
    if "@placeholder" not in sentence_with_placeholder:
        print "no placeholder in the sentence..."
    return sentence_with_placeholder.replace("@placeholder", answer)
In [4]:
'''
We use the stop-word list from the Natural Language Toolkit (NLTK).
'''
import re
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.corpus import stopwords
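If the punkt model or the stop-word list is not installed yet, each can be fetched once with nltk.download:
In [ ]:
# One-time downloads (network access required); safe to re-run.
import nltk
nltk.download('punkt')
nltk.download('stopwords')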
In [5]:
'''
This turns the text of a file into the list of words that make up the text.
'''
def text_to_wordlist(text, remove_stopwords=True):
    text = re.sub("[^a-zA-Z]", " ", text)  # keep letters only
    words = text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]
    return words
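For example, on a made-up sentence:
In [ ]:
print text_to_wordlist("The King, aged 70, visited the museum!")
# -> ['king', 'aged', 'visited', 'museum']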
In [6]:
def text_to_sentences(text, tokenizer, remove_stopwords=True):
    # Split the text into sentences, then clean each sentence.
    raw_sentences = tokenizer.tokenize(text)
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            lst = text_to_wordlist(raw_sentence, remove_stopwords)
            joint_sentence = " ".join(lst)
            sentences.append(joint_sentence)
    return sentences
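Again with a made-up snippet:
In [ ]:
print text_to_sentences("The king arrived. He was late.", tokenizer)
# -> ['king arrived', 'late']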
In [7]:
def clean_text(text):
    # Same cleaning as text_to_wordlist, but joined back into one string.
    text = re.sub("[^a-zA-Z]", " ", text)
    words = text.lower().split()
    stops = set(stopwords.words('english'))
    words = [w for w in words if w not in stops]
    return " ".join(words)
In [8]:
len(testlist)
Out[8]:
In [9]:
'''
Check that the data is read in correctly.
'''
TEST_PATH = 'test/'
sampleTestFilePath = TEST_PATH + testlist[0]
with open(sampleTestFilePath, 'r') as f:
    print f.read()
In [10]:
'''
Put the file paths into a list.
'''
testFilePaths = [TEST_PATH + testfileName for testfileName in testlist]
print testFilePaths[3]
print len(testFilePaths)
In [13]:
'''
Make a list of tuples, one per test file.
'''
testSets = []
for testPath in testFilePaths:
    with open(testPath, 'r') as f:
        data_tuple = extract(f.readlines())
    testSets.append(data_tuple)
print len(testSets)
print testSets[0]
This function calculates cosine similarities between 'query(answer_cand)' (the query sentence with @placeholder replaced by answer_cand) and each sentence in the text of the file. It records the similarities and looks for where they peak: each query(answer_cand) sentence is compared against every sentence in the text, and the highest similarity score is recorded for that answer_cand. After looping through all candidates, the model chooses the highest-scoring candidate as the answer. As you can see, the function returns the predicted answer together with its highest similarity.
num_features means 'the number of words to consider': cosine similarity compares two sentences using only the top-N most frequent words, and you can set N with this parameter.
In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
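To make the similarity concrete, here is a tiny worked example with two made-up sentences. The vocabulary, in alphabetical order, is (king, palace, queen, visited), so the count vectors are [1, 1, 0, 1] and [0, 1, 1, 1] and their cosine similarity is 2/3:
In [ ]:
vec = CountVectorizer(analyzer='word', max_features=100)
m = vec.fit_transform(["king visited palace",
                       "queen visited palace"]).toarray()
# dot product 2, both norms sqrt(3), so cosine = 2/3 = 0.667
print cosine_similarity(m[0].reshape(1, -1), m[1].reshape(1, -1))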
In [15]:
def predict(testset, num_features=100):
    abstract = testset[1]
    query = testset[2]
    answer = testset[3]
    answer_cand = testset[4]
    # One filled-in query sentence per answer candidate.
    query_list_filled = [clean_text(make_query(query, cand)) for cand in answer_cand]
    vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=num_features)
    sentence_list = text_to_sentences(abstract, tokenizer)
    sentence_list += query_list_filled
    train_data_features = vectorizer.fit_transform(sentence_list)
    train_data_features = train_data_features.toarray()
    ratio = 0.0
    index = -1
    # The candidate rows sit at the end of the feature matrix, so the
    # negative index i lines up with answer_cand[i].
    for i in range(-len(query_list_filled), 0):
        cand_sentence = train_data_features[i].reshape(1, -1)
        for j in range(len(sentence_list) - len(query_list_filled)):
            abst_sentence = train_data_features[j].reshape(1, -1)
            temp = cosine_similarity(cand_sentence, abst_sentence)[0][0]
            if temp > ratio:
                ratio = temp
                index = i
    prediction = answer_cand[index]
    return prediction, ratio
In [17]:
predict(testSets[1])
Out[17]:
In [23]:
def get_accuracy(someSets, num_features=100):
    length = len(someSets)
    cnt = 0
    cnt_for_print = 0
    percent = 0
    num_err = 0
    step = max(length / 100, 1)  # guard against division by zero for small sets
    for a_set in someSets:
        cnt_for_print += 1
        answer = a_set[3].lower().strip()
        try:
            prediction = predict(a_set, num_features)[0].lower().strip()
            if answer == prediction:
                cnt += 1
        except:
            # Skip files that fail and exclude them from the denominator.
            num_err += 1
        if cnt_for_print % step == 0:
            percent += 1
            # print answer
            # print prediction
            print "%d%% processed..." % (percent)
    return cnt * 100.0 / (length - num_err)
In [ ]:
get_accuracy(testSets)
In [12]:
'''
This extract function is for the CNN data type, where answer candidates appear anonymized in the text.
Candidates are formatted like "@entity123:King", so we need to drop the ':King' part before calculating cosine similarity.
For that data, use this 'extract' function instead of the 'extract' above.
'''
def extract(listdata):
    title = listdata[0].strip()
    passage = listdata[2].strip()
    query = listdata[4].strip()
    ans = listdata[6].strip()
    # Drop blank lines, then keep only the '@entityN' part of each candidate.
    # (list.remove('\n') would drop only the first blank line.)
    cand = [c.strip().split(':')[0] for c in listdata[8:] if c != '\n']
    return title, passage, query, ans, cand
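The candidate parsing, shown on made-up lines:
In [ ]:
print [c.strip().split(':')[0] for c in ["@entity123:King\n", "\n", "@entity7:Queen\n"] if c != '\n']
# -> ['@entity123', '@entity7']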