In [1]:
## Importing Necessary Packages
from bs4 import BeautifulSoup
import json
import os
import pandas as pd
import re
import requests
import subprocess

In [2]:
## Switching the default string encoding from ASCII to UTF-8
## so that non-ASCII text can be written conveniently (Python 2 only)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
print sys.getdefaultencoding()
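
Note: the reload(sys)/setdefaultencoding trick above only exists on Python 2. A less invasive alternative, sketched below, is to open files with an explicit UTF-8 encoding instead of changing the process-wide default; the path used here is hypothetical.

## Alternative sketch: write UTF-8 text without changing the global default encoding
import io
with io.open("output/sample.txt", "w", encoding="utf-8") as f:   ## hypothetical path
    f.write(u"non-ASCII text is fine here: \u00e9, \u00fc")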

In [3]:
## Checking whether the default encoding has been changed
import sys
print sys.getdefaultencoding()

In [4]:
## Generating the URL of each year's proceedings index page
base_url = "http://papers.nips.cc"
index_conf_year = 2015
index_conf_number = 28
index_url_array = []
for i in range(28):
    a = '%s/book/advances-in-neural-information-processing-systems-%i-%i' % (base_url, index_conf_number, index_conf_year)
    index_url_array.append(a)
    index_conf_year -= 1
    index_conf_number -= 1
## Appending the 1987 URL explicitly because its page name follows a different format
index_url_array.append("https://papers.nips.cc/book/neural-information-processing-systems-1987")
print index_url_array


['http://papers.nips.cc/book/advances-in-neural-information-processing-systems-28-2015', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-27-2014', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-26-2013', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-25-2012', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-24-2011', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-23-2010', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-22-2009', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-21-2008', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-20-2007', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-19-2006', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-18-2005', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-17-2004', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-16-2003', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-15-2002', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-14-2001', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-13-2000', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-12-1999', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-11-1998', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-10-1997', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-9-1996', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-8-1995', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-7-1994', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-6-1993', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-5-1992', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-4-1991', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-3-1990', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-2-1989', 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-1-1988', 'https://papers.nips.cc/book/neural-information-processing-systems-1987']
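
As an optional sanity check before scraping, the generated index URLs can be probed to confirm they resolve; a minimal sketch that only probes the first few to keep it quick:

## Optional sanity check: confirm the first few index pages are reachable.
for url in index_url_array[:3]:
    resp = requests.head(url, allow_redirects=True)
    print url, resp.status_code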

In [5]:
## Function for converting a PDF to plain text via the 'pdftotext' program
## The 'pdftotext' utility (shipped with poppler-utils / xpdf) must be installed on the operating system
def text_from_pdf(pdf_path, temp_path):
    if os.path.exists(temp_path):
        os.remove(temp_path)
    subprocess.call(["pdftotext", pdf_path, temp_path])
    with open(temp_path) as f:
        text = f.read()
    os.remove(temp_path)
    return text
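
A minimal usage sketch, assuming the 'pdftotext' binary is on the PATH and a PDF has already been downloaded to the hypothetical path below:

## Example call: convert one downloaded PDF and inspect the first characters.
sample_pdf = os.path.join("output", "pdfs", "2015", "sample-paper.pdf")   ## hypothetical file
sample_text = text_from_pdf(sample_pdf, os.path.join("output", "temp.txt"))
print sample_text[:200]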

In [18]:
## Iteration loop over each year's index page, starting from 2015
index_conf_year = 2015
for index_url in index_url_array:
    r = requests.get(index_url)
    ## BeautifulSoup parses the HTML page into a navigable tree for convenient extraction
    soup = BeautifulSoup(r.content, "lxml")
    paper_links = [link for link in soup.find_all('a') if link["href"][:7]=="/paper/"]
    ## Total papers found for this year
    print("%d Papers Found" % len(paper_links))
    ## Initializing lists used to accumulate this year's records
    nips_authors = list()
    papers = list()
    paper_authors = list()
    authors_all = list()
    email_author =  list()
    email_id = list()
    emails = []
    tempEmail = []
    event_type = []
    
    temp_path = os.path.join("output", "temp.txt")
    ## Iterating over each of this year's papers (PDF + metadata)
    for link in paper_links:
        paper_title = link.contents[0]
        info_link = base_url + link["href"]
        pdf_link = info_link + ".pdf"
        pdf_name = link["href"][7:] + ".pdf"
        paper_id = re.findall(r"^(\d+)-", pdf_name)[0]
        pdf = requests.get(pdf_link)
        folder_name = '%i' % index_conf_year
        ## Saving the paper PDF into the year-specific folder (creating it if necessary)
        pdf_dir = os.path.join("output", "pdfs", folder_name)
        if not os.path.exists(pdf_dir):
            os.makedirs(pdf_dir)
        pdf_path = os.path.join(pdf_dir, pdf_name)
        with open(pdf_path, "wb") as pdf_file:
            pdf_file.write(pdf.content)
        ## Parsing the paper's detail page into a BeautifulSoup tree.
        paper_soup = BeautifulSoup(requests.get(info_link).content, "lxml")
        ## Getting abstract from BeautifulSoup.
        abstract = paper_soup.find('p', attrs={"class": "abstract"}).contents[0]
        ## Getting authors from BeautifulSoup.
        authors = [(re.findall(r"-(\d+)$", author.contents[0]["href"])[0],
                    author.contents[0].contents[0])
                   for author in paper_soup.find_all('li', attrs={"class": "author"})]
        
        ## Getting Event_type from BeautifulSoup.
        event_types = [h.contents[0][23:] for h in paper_soup.find_all('h3') if h.contents[0][:22]=="Conference Event Type:"]
        ## The event type is occasionally missing, so default to "Poster" in that case
        if not event_types:
            event_types.append("Poster")
        if len(event_types) != 1:
            print([h.contents for h in paper_soup.find_all('h3')])
            raise Exception("Bad Event Data")    
        event_type = event_types[0]
        
        paper_text = text_from_pdf(pdf_path, temp_path)
        print(paper_title)
        
        ## Extracting email addresses from the PDF text
        ## Regular expression for plain "user@domain" addresses.
        emails = re.findall(r'[\w\.-]+@[\w\.-]+', paper_text)
        if not emails:
            ## Fallback for grouped addresses of the form "{alice, bob}@domain".
            rx = r'\{(?P<individual>[^{}]+)\}@(?P<domain>\S+)'
            for match in re.finditer(rx, paper_text):
                email_author = [x.strip() for x in match.group('individual').split(',')]
                for email in email_author:
                    emails.append(email + '@' + match.group('domain'))

        ## Appending rows to the nips_authors and paper_authors lists
        for j, author in enumerate(authors):
            try:
                nips_authors.append([author[0], paper_id, author[1], emails[j], index_conf_year])
                print nips_authors[-1]
                paper_authors.append([len(paper_authors)+1, paper_id, author[0]])
            except IndexError:
                ## Fewer emails were extracted than there are authors; skip the unmatched author
                pass
        ## appending data to papers lists
        papers.append([paper_id, paper_title, event_type, pdf_name, abstract, paper_text])
    ## decrementing year
    index_conf_year -= 1
    ## Writing the accumulated lists to CSV files in append mode.
    pd.DataFrame(nips_authors, columns=["Id", "PaperId", "Name", "Email", "Year"]).to_csv("output/Authors.csv", mode='a', header=False, index=False)
    pd.DataFrame(papers, columns=["Id", "Title", "EventType", "PdfName", "Abstract", "PaperText"]).to_csv("output/Papers.csv", mode='a', header=False, index=False)
    pd.DataFrame(paper_authors, columns=["Id", "PaperId", "AuthorId"]).to_csv("output/PaperAuthors.csv", mode='a', header=False, index=False)
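
The fallback pattern used above for grouped addresses of the form "{alice, bob}@domain" can be checked in isolation; a small sketch on a made-up line:

## Quick check of the fallback email pattern on a made-up grouped address.
sample_line = "{alice, bob}@cs.example.edu"
for match in re.finditer(r'\{(?P<individual>[^{}]+)\}@(?P<domain>\S+)', sample_line):
    names = [x.strip() for x in match.group('individual').split(',')]
    print [name + '@' + match.group('domain') for name in names]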

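Because the CSVs are written with header=False, column names have to be supplied again when reading them back; a minimal sketch for the authors file (the same applies to the other outputs):

## Reading an appended CSV back with explicit column names (no header row was written).
authors_df = pd.read_csv("output/Authors.csv",
                         names=["Id", "PaperId", "Name", "Email", "Year"])
print authors_df.head()
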
In [8]:
## Counting each year's total papers
index_conf_year = 2015
for index_url in index_url_array:
    nips_papers = []
    r = requests.get(index_url)
    soup = BeautifulSoup(r.content, "lxml")
    paper_links = [link for link in soup.find_all('a') if link["href"][:7]=="/paper/"]
    print("%d Papers Found" % len(paper_links))
    ## Row order matches the ["Year", "Total_Paper"] columns below
    nips_papers.append([index_conf_year, len(paper_links)])
    index_conf_year -= 1
    pd.DataFrame(nips_papers, columns=["Year", "Total_Paper"]).to_csv("output/Years.csv", mode='a', header=False, index=False)


403 Papers Found
411 Papers Found
360 Papers Found
368 Papers Found
306 Papers Found
292 Papers Found
262 Papers Found
250 Papers Found
217 Papers Found
204 Papers Found
207 Papers Found
207 Papers Found
198 Papers Found
207 Papers Found
197 Papers Found
152 Papers Found
150 Papers Found
151 Papers Found
150 Papers Found
152 Papers Found
152 Papers Found
140 Papers Found
158 Papers Found
127 Papers Found
144 Papers Found
143 Papers Found
101 Papers Found
94 Papers Found
90 Papers Found
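
To eyeball the growth in accepted papers, the per-year counts written to Years.csv can be plotted; a minimal sketch assuming matplotlib is installed:

## Optional: plot the number of accepted papers per year from the generated CSV.
import matplotlib.pyplot as plt
years_df = pd.read_csv("output/Years.csv", names=["Year", "Total_Paper"])
years_df = years_df.sort_values("Year")
plt.plot(years_df["Year"], years_df["Total_Paper"], marker="o")
plt.xlabel("Year")
plt.ylabel("Papers accepted")
plt.show()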

In [1]:
## Rebuilding the URL array with only the most recent (2015) proceedings page
base_url = "http://papers.nips.cc"
index_conf_year = 2015
index_conf_number = 28
index_url_array = []
for i in range(1):
    a = 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-%i-%i' %(index_conf_number,index_conf_year)
    index_url_array.append(a)
    index_conf_year -= 1
    index_conf_number -= 1
print index_url_array


['http://papers.nips.cc/book/advances-in-neural-information-processing-systems-28-2015']

In [9]:
## Reusing a tweaked version of the code above to save just one year's paper abstracts
index_conf_year = 2015   ## reset explicitly; the previous cell left this at 2014
for index_url in index_url_array:
    r = requests.get(index_url)
    ## BeautifulSoup parses the HTML page into a navigable tree for convenient extraction
    soup = BeautifulSoup(r.content, "lxml")
    paper_links = [link for link in soup.find_all('a') if link["href"][:7]=="/paper/"]
    ## Total papers found for this year
    print("%d Papers Found" % len(paper_links))
    ## Initializing lists (only `papers` is actually used in this cell)
    nips_authors = list()
    papers = list()
    paper_authors = list()
    authors_all = list()
    email_author =  list()
    email_id = list()
    emails = []
    tempEmail = []
    event_type = []
    
    temp_path = os.path.join("output", "temp.txt")
    ## Iterating over each of this year's papers
    for link in paper_links:
        paper_title = link.contents[0]
        info_link = base_url + link["href"]
        pdf_link = info_link + ".pdf"
        pdf_name = link["href"][7:] + ".pdf"
        paper_id = re.findall(r"^(\d+)-", pdf_name)[0]
        pdf = requests.get(pdf_link)
        folder_name = '%i' %index_conf_year
        ## Parsing the paper's detail page into a BeautifulSoup tree.
        paper_soup = BeautifulSoup(requests.get(info_link).content, "lxml")
        ## Getting abstract from BeautifulSoup.
        abstract = paper_soup.find('p', attrs={"class": "abstract"}).contents[0]
        ## Getting authors from BeautifulSoup.
        authors = [(re.findall(r"-(\d+)$", author.contents[0]["href"])[0],
                    author.contents[0].contents[0])
                   for author in paper_soup.find_all('li', attrs={"class": "author"})]
           
        print(paper_title)

        papers.append([paper_id, index_conf_year, paper_title, pdf_name, abstract])
    index_conf_year -= 1
    ## Writing the accumulated list to a CSV file in append mode.
    pd.DataFrame(papers, columns=["Id", "Year", "Title", "PdfName", "Abstract"]).to_csv("output/Papers_abstract.csv", mode='a', header=False, index=False)
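
Since every cell above writes with mode='a', re-running a cell appends duplicate rows; a minimal cleanup sketch for the abstracts file (the same idea applies to the other CSVs):

## Optional cleanup: drop duplicate rows created by re-running the append-mode cells.
abstracts_df = pd.read_csv("output/Papers_abstract.csv",
                           names=["Id", "Year", "Title", "PdfName", "Abstract"])
abstracts_df = abstracts_df.drop_duplicates(subset=["Id", "Year"])
abstracts_df.to_csv("output/Papers_abstract.csv", header=False, index=False)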

In [ ]: