In [6]:
import json
import csv
import os

# Paper class holding all information for one paper; serialized to JSON later for storage
class Paper:
    
    def __init__(self, title, abstract, year, authors, b_topic, topics, pid, detc, url):
        
        # Basic info
        self.title = title
        self.abstract = abstract
        self.year = year
        self.authors = authors
        self.broad_topic = b_topic
        self.topics = topics
        self.pid = pid
        self.detc = detc
        self.url = url
        
        # filled in later, during graph construction
        self.index = -1
        self.author_ids = []
        self.citations = []
        self.cited_by = []
        self.all_cite = []

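A quick sanity check of the class with placeholder values (the record below is illustrative only, not a real entry from the dataset):

In [ ]:
# illustrative only: construct a Paper with dummy values to confirm the field layout
demo = Paper(title="Example title", abstract="Example abstract.", year=2014,
             authors=["A. Author"], b_topic="Design Automation", topics=["optimization"],
             pid="0000", detc="DETC2014-00000", url="http://example.com/paper")
print(demo.title, demo.year, demo.citations)
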
In [7]:
# database from the 2014 team
with open("data/DAC_Entire_DataBase.json", "r") as f:
    p_data = json.load(f)

In [8]:
# build Paper objects from the raw records
papers = []
count = 0
for p in p_data:
    newpaper = Paper(p['Title'], p['Abstract'], p['Year'], p['Authors'], p['Broad_Topic'],
                     p['Topics'], p['PaperID'], p['DETC'], p['URL'])
    newpaper.index = str(count)
    count += 1
    papers.append(newpaper)

print("# of valid nodes: " + str(count))


# of valid nodes: 1554

In [9]:
# build dictionary mapping url -> PaperID
url_to_id = {}
with open('data/papers.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        url_to_id[row['url']] = row['PaperID']

In [10]:
# citations_links.csv is from the 2014 team; map sourcePaperId -> list of targetPaperIds
id_to_citations = {}
with open('data/citations_links.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        s = row['sourcePaperId']
        t = row['targetPaperId']
        if s in id_to_citations:
            id_to_citations[s].append(t)
        else:
            # start the list with the first target so the first citation is not dropped
            id_to_citations[s] = [t]

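The same table can be built more compactly with collections.defaultdict, which removes the explicit membership test; this is an equivalent sketch, not part of the original pipeline:

In [ ]:
from collections import defaultdict

# equivalent construction of id_to_citations using defaultdict(list)
id_to_citations_alt = defaultdict(list)
with open('data/citations_links.csv') as csvfile:
    for row in csv.DictReader(csvfile):
        id_to_citations_alt[row['sourcePaperId']].append(row['targetPaperId'])

# should match the table built above
assert id_to_citations_alt == id_to_citations
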
In [11]:
# build dictionaries mapping PaperID <-> index
id_to_index = {}
index_to_id = {}
for p in papers:
    pid = url_to_id[p.url]
    id_to_index[pid] = p.index
    # map the index back to the PaperID itself, not to another index
    index_to_id[p.index] = pid

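Since each index comes from a unique paper, the two tables should be exact inverses; a small consistency check (not in the original notebook):

In [ ]:
# id_to_index and index_to_id should be inverse mappings of each other
assert all(index_to_id[idx] == pid for pid, idx in id_to_index.items())
print(str(len(id_to_index)) + " ids mapped to indices")
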
In [12]:
# build citations, keeping only targets that correspond to papers in the dataset
for p in papers:
    get_id = url_to_id[p.url]
    if get_id in id_to_citations:
        for c in id_to_citations[get_id]:
            # check validity before the lookup to avoid a KeyError on unknown ids
            if c in id_to_index:
                new_c = id_to_index[c]
                p.citations.append(new_c)
                p.all_cite.append(new_c)

In [13]:
# build cited by
for p in papers:
    for c in p.citations:
        papers[int(c)].cited_by.append(p.index)
        papers[int(c)].all_cite.append(p.index)

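Every edge added above contributes one entry to citations and one to cited_by, so the two totals must agree; a quick sanity check (added here, not part of the original pipeline):

In [ ]:
# each citation edge appears once on the source side and once on the target side
n_out = sum(len(p.citations) for p in papers)
n_in = sum(len(p.cited_by) for p in papers)
assert n_out == n_in
print("# of citation edges: " + str(n_out))
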
In [14]:
# generate json for later usage
out_pp = []
for p in papers:
    # __dict__ gives a plain attribute dict that json can serialize
    out_pp.append(p.__dict__)

if os.path.isfile("data/Paper_2014_clean.json"):
    os.remove("data/Paper_2014_clean.json")

# open in text mode: json.dump writes str, not bytes
with open("data/Paper_2014_clean.json", "w") as f:
    json.dump(out_pp, f)

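The cleaned file can later be reloaded as plain dictionaries whose keys match the attribute names of Paper; a minimal round-trip check (not part of the original notebook):

In [ ]:
# reload the cleaned JSON and spot-check the first record
with open("data/Paper_2014_clean.json", "r") as f:
    check = json.load(f)
print(len(check), check[0]['title'])
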
In [15]:
# generate the abstract file for phrase mining

if os.path.isfile("data/dac_abstract.txt"):
    os.remove("data/dac_abstract.txt")

# open with an explicit encoding instead of encoding each line by hand
with open("data/dac_abstract.txt", "w", encoding="utf-8") as text_file:
    for p in papers:
        # repeat the title to give it a higher weight than the abstract body
        combine = p.abstract
        for i in range(3):
            combine += (' ' + p.title)
        text_file.write(combine + "\n")

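Most phrase-mining tools expect one document per line, so the file should contain exactly one line per paper, assuming no abstract has an embedded newline; a quick check of that assumption:

In [ ]:
# count lines in the output file; should equal len(papers) if abstracts contain no newlines
with open("data/dac_abstract.txt", encoding="utf-8") as f:
    n_lines = sum(1 for _ in f)
print(n_lines, len(papers))
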
In [16]:
print(len(papers))


1554

In [ ]: