In [21]:
import json
import re
import os
# load phrase-mining result (first half of TopMine)
lines = []
with open("data/corpus.txt") as f:
    data = f.readlines()
for line in data:
    lines.append(line)
In [22]:
# eliminate double spaces inside a phrase
def process_double_space(phrase):
    new_phrase = re.split(' ', phrase)
    # drop every empty token (remove('') only strips the first one), so any run of spaces is handled
    new_phrase = [w for w in new_phrase if w != '']
    new_phrase = ' '.join(new_phrase)
    return new_phrase
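A quick illustration of the cleaner on hypothetical inputs (not taken from the corpus):
# illustrative only
print(process_double_space('deep  learning'))   # -> 'deep learning'
print(process_double_space(' neural network'))  # -> 'neural network' (leading space dropped)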
In [23]:
# build dictionaries for phrase frequency and phrase index
phrase_freq = {}
phrase_list = []
phrase_index = {}
count = 0
for line in lines:
    # each line lists the phrases of one abstract, separated by commas
    phrases = re.split('\n|,', line.replace('-', ' '))
    # drop empty tokens (e.g. from the trailing newline) instead of remove(''),
    # which only strips the first one and raises if none is present
    phrases = [ph for ph in phrases if ph != '']
    sub_list = []
    for phrase in phrases:
        new_phrase = process_double_space(phrase)
        if new_phrase not in phrase_index:
            phrase_index[new_phrase] = count
            count += 1
        if new_phrase in phrase_freq:
            phrase_freq[new_phrase] += 1
        else:
            phrase_freq[new_phrase] = 1
        sub_list.append(new_phrase)
    phrase_list.append(sub_list)
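For intuition, the same counting/indexing pattern applied to a hypothetical two-line corpus (toy_lines and its contents are made up for illustration) would produce the structures shown in the comments:
# illustrative only: toy corpus, not data/corpus.txt
toy_lines = ["machine learning,neural network\n", "neural network,deep learning\n"]
toy_index, toy_freq, toy_list = {}, {}, []
for line in toy_lines:
    phrases = [ph for ph in re.split('\n|,', line.replace('-', ' ')) if ph != '']
    row = [process_double_space(ph) for ph in phrases]
    for ph in row:
        toy_index.setdefault(ph, len(toy_index))
        toy_freq[ph] = toy_freq.get(ph, 0) + 1
    toy_list.append(row)
print(toy_index)  # {'machine learning': 0, 'neural network': 1, 'deep learning': 2}
print(toy_freq)   # {'machine learning': 1, 'neural network': 2, 'deep learning': 1}
print(toy_list)   # [['machine learning', 'neural network'], ['neural network', 'deep learning']]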
In [24]:
# check the sizes of the dictionaries
print(len(phrase_freq))
print(len(phrase_index))
In [25]:
# enumerate every contiguous sub-phrase of a phrase (used to break long phrases down)
def regroup_phrase(phrase):
    group = []
    new_phrase = re.split(' ', phrase)
    max_size = len(new_phrase)
    # i + 1 is the number of words in the sub-phrase, j is its starting position
    for i in range(0, max_size):
        for j in range(0, len(new_phrase) - i):
            new_string = new_phrase[j]
            for k in range(i):
                new_string += ' '
                new_string += new_phrase[j + k + 1]
            group.append(new_string)
    return group

regroup_phrase('a b c')
Out[25]:
['a', 'b', 'c', 'a b', 'b c', 'a b c']
In [26]:
# break each long phrase into sub-phrases, keeping only sub-phrases that are in the original dictionary
import math
import numpy
super_phrase_list = []
super_phrase_freq = {}
phrase_docfreq = {}
phrase_idf = numpy.zeros(len(phrase_index))
for line in phrase_list:
    temp = set()
    sub_list = []
    for phrase in line:
        group = regroup_phrase(phrase)
        for i in group:
            if i in phrase_index:
                temp.add(i)
                sub_list.append(i)
                if i in super_phrase_freq:
                    super_phrase_freq[i] += 1
                else:
                    super_phrase_freq[i] = 1
    # document frequency: each phrase counted at most once per document
    for i in temp:
        if i in phrase_docfreq:
            phrase_docfreq[i] += 1
        else:
            phrase_docfreq[i] = 1
    super_phrase_list.append(sub_list)
# calculate IDF: idf(p) = log2(number of documents / document frequency of p)
for key in phrase_docfreq:
    count = phrase_docfreq[key]
    phrase_idf[phrase_index[key]] = math.log(len(super_phrase_list) / float(count), 2)
# check the sizes of the dictionaries and the number of documents
print(len(super_phrase_freq))
print(len(phrase_docfreq))
print(len(super_phrase_list))
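For reference, the IDF used above is idf(p) = log2(N / df(p)), with N the number of documents and df(p) the number of documents containing phrase p at least once. A quick check (illustrative; picks an arbitrary phrase):
# illustrative check: recompute the IDF of one phrase and compare with the stored value
some_phrase = next(iter(phrase_docfreq))
manual_idf = math.log(len(super_phrase_list) / float(phrase_docfreq[some_phrase]), 2)
print(manual_idf, phrase_idf[phrase_index[some_phrase]])  # the two values should match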
In [27]:
# generate tf-idf vectors for all papers
texts_vec = []
for line in super_phrase_list:
    vec = numpy.zeros(len(phrase_index))
    # accumulate local counts
    for p in line:
        vec[phrase_index[p]] += 1
    # tf: count divided by the number of phrases in the document
    vec /= float(len(line))
    # tf-idf: multiply elementwise by the idf vector
    vec *= phrase_idf
    texts_vec.append(vec)
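As a sanity check of the construction (illustrative; assumes the first document contains at least one phrase), each stored entry should equal (count / number of phrases in the document) * idf:
# illustrative check: recompute one tf-idf entry by hand
doc_id = 0
doc = super_phrase_list[doc_id]        # assumed non-empty
p0 = doc[0]
tf = doc.count(p0) / float(len(doc))   # term frequency as defined above
print(tf * phrase_idf[phrase_index[p0]])
print(texts_vec[doc_id][phrase_index[p0]])  # should agree with the line above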
In [28]:
# similarity function: cosine similarity
from numpy import linalg as la
def get_similarity(a, b):
    return numpy.dot(a, b) / (la.norm(a) * la.norm(b))
print(get_similarity(texts_vec[0], texts_vec[8]))
print(get_similarity(texts_vec[0], texts_vec[0]))
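Note that the cosine is undefined (NaN) when either vector is all zeros, e.g. a paper whose abstract contributed no dictionary phrases; if that case can occur in the data, a guarded variant such as this sketch avoids NaN scores:
# defensive variant (sketch): return 0.0 instead of NaN when either vector has zero norm
def get_similarity_safe(a, b):
    denom = la.norm(a) * la.norm(b)
    return numpy.dot(a, b) / denom if denom > 0 else 0.0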
In [29]:
# load the cleaned paper metadata; similarity information will be added to it below
with open("data/Paper_2014_clean.json", "r") as f:
    p_data = json.load(f)
In [30]:
# make sure the index of the json entries matches the index of our result
print(p_data[1553]['abstract'])
print(super_phrase_list[1553])
In [31]:
# compute the similarity score for citations and add it to the json
for p in p_data:
    idx = int(p['index'])
    cite_sim = []
    cite_by_sim = []
    all_cite_sim = []
    for i in p['citations']:
        s = get_similarity(texts_vec[idx], texts_vec[int(i)])
        cite_sim.append(s)
        all_cite_sim.append(s)
    for i in p['cited_by']:
        s = get_similarity(texts_vec[idx], texts_vec[int(i)])
        cite_by_sim.append(s)
        all_cite_sim.append(s)
    # json doesn't support numpy arrays, so store plain lists of scores
    p['citations_sim'] = cite_sim
    p['cited_by_sim'] = cite_by_sim
    p['all_cite_sim'] = all_cite_sim
In [32]:
# sanity check
p = p_data[1]
print(p_data[803]['abstract'])
print(p_data[803]['broad_topic'])
print(' ')
print(p_data[108]['abstract'])
print(p_data[108]['broad_topic'])
print(' ')
print(p['abstract'])
print(p['broad_topic'])
print(' ')
print(p['citations'])
print(p['citations_sim'])
In [33]:
# build phrase features for the classification step
for p in p_data:
    i = int(p['index'])
    vec = texts_vec[i].copy()  # copy so the tf-idf vectors are not modified in place
    z = sum(vec)
    # normalize the weights to sum to 1, then scale by the number of phrases in the document
    vec /= z
    vec *= len(super_phrase_list[i])
    word_bag = {}
    for j in super_phrase_list[i]:
        word_bag[str(phrase_index[j])] = vec[phrase_index[j]]
    p['phrases'] = word_bag
    p['phrases_size'] = len(super_phrase_list[i])
# inverse mapping from phrase id back to the phrase string
index_phrase = {}
for key in phrase_index:
    index_phrase[phrase_index[key]] = key
In [34]:
# sanity check
from pprint import pprint
print(index_phrase[1])
pprint(p_data[0]['phrases'])
print(p_data[0]['phrases_size'])
print(p_data[0]['abstract'])
In [35]:
# bundle the papers and the index->phrase mapping into one json file
super_data = {}
super_data['papers'] = p_data
super_data['index_phrase'] = index_phrase
path = "data/super_data.json"
if os.path.isfile(path):
    os.remove(path)
with open(path, "w") as f:
    json.dump(super_data, f)
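An optional round-trip check: reload the dumped file and note that JSON turns the integer keys of index_phrase into strings:
# optional round-trip check
with open(path) as f:
    reloaded = json.load(f)
print(len(reloaded['papers']), len(reloaded['index_phrase']))
print(reloaded['index_phrase']['1'])  # keys come back as strings after the json round-trip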
In [ ]: