A notebook to extract the abstract of every paper from its raw PDF text.


In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#%config InlineBackend.figure_format = 'pdf'
from IPython.core.display import HTML
import matplotlib.pyplot as plt
from nltk.tokenize import WhitespaceTokenizer
import numpy as np
import os
import pandas as pd
try:
    # cPickle is the faster C implementation available in Python 2
    import cPickle as pickle
except ImportError:
    import pickle

import re
import scipy.stats as stats
import scipy.sparse as sp
import string
import sys
import csv

Get the abstracts from the raw paper text


In [ ]:
def paper_dataframe(fpath):
    rows = []
    with open(fpath, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        # Each row: ['Id', 'Title', 'EventType', 'PdfName', 'Abstract', 'PaperText']
        # Skip the header row
        next(reader)
        for row in reader:
            rows.append(tuple(row))
    cols = ['Id', 'Title', 'EventType', 'PdfName', 'Abstract', 'PaperText']
    data = pd.DataFrame(rows, columns=cols)
    return data

def get_abstract_from_paper_text(text):
    """
    Attempt to extract the abstract of a paper from the raw text content
    extracted from the PDF. The content should look something like:
    
    ASSOCIATIVE LEARNING
    VIA INHIBITORY SEARCH
    David H. Ackley
    Bell Communications Research
    Cognitive Science Research Group

    ABSTRACT
    ALVIS is a reinforcement-based connectionist architecture that
    learns associative maps in continuous multidimensional environments. The discovered locations of positive and negative reinforcements are recorded in "do be" and "don't be" subnetworks,
    respectively. ...........

    INTRODUCTION
    The "backpropagation algorithm" or generalized delta rule (Rumelhart, Hinton, &
    Williams, 1986) is sometimes cr
    
    The function relies on the keyword "abstract" appearing on its own line
    to mark the beginning of the abstract, and on the fact that there is
    one empty line after the abstract.

    Return the abstract if successful. Otherwise, return None.
    """
    # locate the line in which there is only the word "abstract"
    lines = text.split('\n')
    abs_line_start = -1
    abs_line_end = -1
    # linear search
    for i, L in enumerate(lines):
        if L.strip().lower() == 'abstract':
            # The abstract runs from the next line down to the next empty line
            abs_line_start = i+1
            break
    
    if abs_line_start == -1:
        return None
    
    # Find the empty line that marks the end of the abstract
    for i in range(abs_line_start + 2, len(lines)):
        L = lines[i]
        if L.strip() == '':
            abs_line_end = i
            break
    
    if abs_line_end == -1:
        return None
    
    abstract = ' '.join(lines[abs_line_start:abs_line_end])
    return abstract.strip()
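
A quick sanity check of the extraction heuristic on a small synthetic paper text (the snippet below is made up for illustration and is not taken from the dataset):

In [ ]:
# A toy paper text: a title block, the "Abstract" keyword on its own line,
# two abstract lines, an empty line, then the introduction.
toy_text = '\n'.join([
    'A TOY PAPER TITLE',
    'Some Author',
    'Some Institution',
    '',
    'Abstract',
    'This is the first sentence of a made-up abstract.',
    'This is the second and last sentence.',
    '',
    '1 Introduction',
    'Body text starts here.',
])
# Expected output: the two abstract lines joined into one string
print(get_abstract_from_paper_text(toy_text))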

In [ ]:
dframe = paper_dataframe('Papers1988_2015.csv')

In [ ]:
# Example paper indices to inspect: 30, 90, 171
ind = 171

a_paper = dframe['PaperText'][ind]
print(a_paper[:1800])

In [ ]:
print(get_abstract_from_paper_text(a_paper))

In [ ]:
# get the abstract for each paper
n_docs = dframe.shape[0]
abstracts = []

for i in range(n_docs):
    paper_i = dframe['PaperText'][i]
    abstract_i = get_abstract_from_paper_text(paper_i)
    abstracts.append(abstract_i)

In [ ]:
# papers that do not have an abstract
print([i for (i, ab) in enumerate(abstracts) if ab is None])
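
For the papers where the extraction fails, one possible fallback (a sketch only; it assumes the CSV's own Abstract column is populated for those rows, which may not hold for all years) is to take the abstract from the dataframe directly:

In [ ]:
# Optional fallback (sketch): fill failed extractions from the CSV's
# Abstract column, assuming that column is populated for the affected rows.
abstracts_filled = [
    ab if ab is not None else dframe['Abstract'][i]
    for (i, ab) in enumerate(abstracts)
]
# Count entries that are still empty after the fallback
print(sum(1 for ab in abstracts_filled if ab is None or str(ab).strip() == ''))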

In [ ]:
# load the pickle containing the document-term matrix, 
# put the abstracts in, and dump it to a file.
fyear = 1988
tyear = 2015
dt_fpath = 'DT_%d_%d.p'%(fyear, tyear)

with open(dt_fpath, 'rb') as f:
    info = pickle.load(f)

# include the abstracts
info['abstracts'] = abstracts
# save the pickle
dt_dest = 'DT_%d_%d_wabs.p'%(fyear, tyear)
with open(dt_dest, 'wb') as f:
    pickle.dump(info, f)
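
A quick check that the new pickle can be read back and contains one abstract per paper (a sketch; only the 'abstracts' key added above is assumed, the other keys are left untouched):

In [ ]:
# Reload the file just written and verify the abstracts made it in
with open(dt_dest, 'rb') as f:
    info_check = pickle.load(f)
assert len(info_check['abstracts']) == n_docs
print('Saved %d abstracts to %s' % (n_docs, dt_dest))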
