In [6]:
import json
import csv
import os
# Container class holding all information for one paper; instances are
# serialized to JSON later for storage.
class Paper():
    def __init__(self, title, abstract, year, authors, b_topic, topics, pid, detc, url):
        """Store the bibliographic fields of one paper record.

        Graph-related fields (index, citation lists) start empty and are
        filled in by later cells.
        """
        # Bibliographic fields taken straight from the source record.
        self.title, self.abstract, self.year = title, abstract, year
        self.authors, self.broad_topic, self.topics = authors, b_topic, topics
        self.pid, self.detc, self.url = pid, detc, url
        # Fields populated by later cells.
        self.index = -1        # position in the global papers list, stored as a string
        self.author_ids = []
        self.citations = []    # indices of papers this one cites
        self.cited_by = []     # indices of papers citing this one
        self.all_cite = []     # union of citations and cited_by
In [7]:
# Load the full DAC paper database produced by the 2014 team.
with open("data/DAC_Entire_DataBase.json") as db_file:
    p_data = json.load(db_file)
In [8]:
# Build one Paper object per database record; each paper's index is its
# position in the list, stored as a string.
papers = []
for idx, record in enumerate(p_data):
    paper = Paper(record['Title'], record['Abstract'], record['Year'],
                  record['Authors'], record['Broad_Topic'], record['Topics'],
                  record['PaperID'], record['DETC'], record['URL'])
    paper.index = str(idx)
    papers.append(paper)
print("# of valid nodes: " + str(len(papers)))
In [9]:
# Build the dictionary url -> PaperID from papers.csv.
with open('data/papers.csv') as csvfile:
    url_to_id = {row['url']: row['PaperID'] for row in csv.DictReader(csvfile)}
In [10]:
# citations_links.csv is from team 2014: map each source PaperID to the
# list of PaperIDs it cites.
id_to_citations = {}
with open('data/citations_links.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        s = row['sourcePaperId']
        t = row['targetPaperId']
        # BUG FIX: the original branched on membership and, on first
        # sight of a source, set id_to_citations[s] = [] WITHOUT
        # appending t — silently dropping the first citation of every
        # source paper. setdefault keeps it.
        id_to_citations.setdefault(s, []).append(t)
In [11]:
# Build the translation dictionaries between dataset PaperIDs and list
# indices: id_to_index (PaperID -> index) and index_to_id (index -> PaperID).
id_to_index = {}
index_to_id = {}
for p in papers:
    pid = url_to_id[p.url]
    id_to_index[pid] = p.index
    # BUG FIX: the original stored id_to_index[url_to_id[p.url]] — i.e.
    # the index itself, just assigned above — so index_to_id mapped
    # index -> index. Store the PaperID, as the name promises.
    index_to_id[p.index] = pid
In [12]:
# Resolve each paper's outgoing citations into list indices, keeping only
# citations that point at papers actually present in this dataset.
for p in papers:
    get_id = url_to_id[p.url]
    for c in id_to_citations.get(get_id, []):
        # BUG FIX: the original evaluated id_to_index[c] BEFORE any
        # membership check, so a citation to a paper outside the dataset
        # raised KeyError; the later `in index_to_id` test (on an index
        # that always exists) could never reject anything. The validity
        # check belongs on c against id_to_index.
        if c in id_to_index:
            new_c = id_to_index[c]
            p.citations.append(new_c)
            p.all_cite.append(new_c)
In [13]:
# Invert the citation edges: for every paper a source cites, record the
# source's index in the target's cited_by (and all_cite) lists.
for source in papers:
    for target_idx in source.citations:
        target = papers[int(target_idx)]
        target.cited_by.append(source.index)
        target.all_cite.append(source.index)
In [14]:
# Serialize every Paper to JSON for later usage; __dict__ gives the
# attribute mapping that defines the JSON shape.
out_pp = [p.__dict__ for p in papers]
# BUG FIX: json.dump writes str, so under Python 3 the file must be
# opened in text mode ("w") — "wb" raises TypeError. "w" also truncates
# any existing file, making the explicit os.remove() dance unnecessary.
with open("data/Paper_2014_clean.json", "w") as f:
    json.dump(out_pp, f)
In [15]:
# Generate the abstract corpus for phrase mining, one paper per line.
# The title is appended multiple times to give it a higher weight.
TITLE_WEIGHT = 3  # number of times the title is repeated after the abstract
with open("data/dac_abstract.txt", "w", encoding="utf-8") as text_file:
    for p in papers:
        combine = p.abstract + (' ' + p.title) * TITLE_WEIGHT
        # BUG FIX: combine.encode("UTF-8") + "\n" mixes bytes and str,
        # a TypeError under Python 3; write str and let the file handle
        # the UTF-8 encoding. "w" truncates, so the os.remove() guard
        # the original carried is unnecessary.
        text_file.write(combine + "\n")
In [16]:
# Sanity check: total number of papers processed.
print(len(papers))
In [ ]: