A notebook to get abstracts of all papers.
In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#config InlineBackend.figure_format = 'pdf'
from IPython.core.display import HTML
import matplotlib.pyplot as plt
from nltk.tokenize import WhitespaceTokenizer
import numpy as np
import os
import pandas as pd
try:
import cPickle as pickle
except:
import pickle
import re
import scipy.stats as stats
import scipy.sparse as sp
import string
import sys
import csv
In [ ]:
def paper_dataframe(fpath):
    """
    Load the papers CSV file into a pandas DataFrame.

    The first row of the file is treated as a header and discarded. Every
    remaining row becomes one record with the columns
    ['Id', 'Title', 'EventType', 'PdfName', 'Abstract', 'PaperText'].
    All values are kept as the strings produced by the csv reader.

    fpath: path to a comma-separated file whose fields may be quoted with '"'.

    Return a DataFrame with one row per data row in the file. An empty file
    gives an empty DataFrame that still carries the six columns.
    """
    columns = ['Id', 'Title', 'EventType',
               'PdfName', 'Abstract', 'PaperText']
    with open(fpath, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        # Skip the header row. The builtin next() works on both Python 2 and
        # Python 3 (reader.next() exists only on Python 2), and the default
        # value keeps an empty file from raising StopIteration.
        next(reader, None)
        rows = [tuple(row) for row in reader]
    return pd.DataFrame(rows, columns=columns)
def get_abstract_from_paper_text(text):
    """
    Attempt to extract the abstract of a paper from the raw text content
    taken out of the pdf. The content should look something like:

        ASSOCIATIVE LEARNING
        VIA INHIBITORY SEARCH
        David H. Ackley
        Bell Communications Research
        Cognitive Science Research Group
        ABSTRACT
        ALVIS is a reinforcement-based connectionist architecture that
        learns associative maps in continuous multidimensional environments. The discovered locations of positive and negative reinforcements are recorded in "do be" and "don't be" subnetworks,
        respectively. ...........

        INTRODUCTION
        The "backpropagation algorithm" or generalized delta rule (Rumelhart, Hinton, &
        Williams, 1986) is sometimes cr

    The function relies on the keyword "abstract" (any letter case,
    surrounding whitespace ignored) on its own line as the beginning of the
    abstract, and on the fact that there is one empty line after the
    abstract.

    text: the raw paper text as a single string with '\\n' line breaks.

    Return the abstract as one string (its lines joined by single spaces
    and stripped) on success. Otherwise, return None: either no "abstract"
    heading line was found, or no empty line follows the abstract.
    """
    lines = text.split('\n')

    # Linear search for the first line holding only the word "abstract";
    # the abstract begins on the line right after it.
    abs_line_start = -1
    for i, line in enumerate(lines):
        if line.strip().lower() == 'abstract':
            abs_line_start = i + 1
            break
    if abs_line_start == -1:
        return None

    # Look for the empty line that terminates the abstract. The search begins
    # two lines past the start, so lines abs_line_start and abs_line_start+1
    # are never taken as the end marker, even if they are blank.
    # range() is used instead of xrange() so this runs on Python 2 and 3.
    abs_line_end = -1
    for i in range(abs_line_start + 2, len(lines)):
        if lines[i].strip() == '':
            abs_line_end = i
            break
    if abs_line_end == -1:
        return None

    abstract = ' '.join(lines[abs_line_start:abs_line_end])
    return abstract.strip()
In [ ]:
# Load every paper record from the CSV file into a DataFrame.
dframe = paper_dataframe('Papers1988_2015.csv')
In [ ]:
# Row index of one paper to inspect by hand.
# 30, 90, 171 (presumably indices that were tried during inspection)
ind = 171
a_paper = dframe['PaperText'][ind]
# Show only the first 1800 characters of the raw text. print(...) with a
# single argument prints the same on Python 2 and also runs on Python 3.
print(a_paper[:1800])
In [ ]:
# Show the abstract extracted from the sample paper (None if not found).
# print(...) with a single argument works on both Python 2 and Python 3.
print(get_abstract_from_paper_text(a_paper))
In [ ]:
# get the abstract for each paper, in row order; an entry is None when
# no abstract could be extracted from that paper's text.
n_docs = dframe.shape[0]
# Iterating the column directly avoids the Python 2-only xrange() and the
# per-row label lookup.
abstracts = [get_abstract_from_paper_text(paper_text)
             for paper_text in dframe['PaperText']]
In [ ]:
# papers that do not have an abstract (row indices whose entry is None)
# print(...) with a single argument works on both Python 2 and Python 3.
print([i for (i, ab) in enumerate(abstracts) if ab is None])
In [ ]:
# load the pickle containing the document-term matrix,
# put the abstracts in, and dump it to a file.
fyear = 1988
tyear = 2015
dt_fpath = 'DT_%d_%d.p'%(fyear, tyear)
# Pickle data is a byte stream, so the file is opened in binary mode;
# text mode fails on Python 3 and can corrupt the data on Windows.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(dt_fpath, 'rb') as f:
    info = pickle.load(f)
# include the abstracts
info['abstracts'] = abstracts
# save the pickle under a new name so the original file is left untouched
dt_dest = 'DT_%d_%d_wabs.p'%(fyear, tyear)
with open(dt_dest, 'wb') as f:
    pickle.dump(info, f)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: