In [1]:
## Importing Necessary Packages
from bs4 import BeautifulSoup
import json
import os
import pandas as pd
import re
import requests
import subprocess
In [2]:
## Changing the default string encoding from ASCII to UTF-8 (Python 2 only)
## so that the scraped text can be written out without encoding errors
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
print sys.getdefaultencoding()
In [3]:
## Checking whether the default encoding has actually changed
import sys
print sys.getdefaultencoding()
In [4]:
## Generating the URL of each year's proceedings homepage
base_url = "http://papers.nips.cc"
index_conf_year = 2015
index_conf_number = 28
index_url_array = []
for i in range(28):
    a = base_url + '/book/advances-in-neural-information-processing-systems-%i-%i' % (index_conf_number, index_conf_year)
    index_url_array.append(a)
    index_conf_year -= 1
    index_conf_number -= 1
## Appending the 1987 URL explicitly because its naming format differs from the later years
index_url_array.append("https://papers.nips.cc/book/neural-information-processing-systems-1987")
print index_url_array
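In [ ]:
## Optional sanity check (not part of the original scrape): verify that each generated
## proceedings URL resolves before starting the full crawl. Assumes the site is reachable;
## HEAD requests are used so no page bodies are downloaded.
for url in index_url_array:
    resp = requests.head(url, allow_redirects=True)
    if resp.status_code != 200:
        print("Unreachable: %s (HTTP %d)" % (url, resp.status_code))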
In [5]:
## Function for converting a PDF to text via the 'pdftotext' program.
## The 'pdftotext' utility must be installed separately on the operating system.
def text_from_pdf(pdf_path, temp_path):
    if os.path.exists(temp_path):
        os.remove(temp_path)
    subprocess.call(["pdftotext", pdf_path, temp_path])
    f = open(temp_path)
    text = f.read()
    f.close()
    os.remove(temp_path)
    return text
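In [ ]:
## Optional check (not in the original notebook): confirm the 'pdftotext' binary is on the
## PATH before the long scraping run, since text_from_pdf depends on it. An OSError here
## means the program is missing; the exit code itself is not important.
try:
    subprocess.call(["pdftotext", "-v"])
    print("pdftotext found")
except OSError:
    print("pdftotext is not installed or not on PATH")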
In [18]:
## Iteration loop over each year's index page, starting from 2015
index_conf_year = 2015
for i, url in enumerate(index_url_array):
    index_url = index_url_array[i]
    r = requests.get(index_url)
    ## BeautifulSoup parses the HTML index page into a navigable tree for convenient element lookup
    soup = BeautifulSoup(r.content, "lxml")
    paper_links = [link for link in soup.find_all('a') if link["href"][:7] == "/paper/"]
    ## Total papers found for this year
    print("%d Papers Found" % len(paper_links))
    ## Initializing the per-year lists
    nips_authors = list()
    papers = list()
    paper_authors = list()
    authors_all = list()
    email_author = list()
    email_id = list()
    emails = []
    tempEmail = []
    event_type = []
    temp_path = os.path.join("output", "temp.txt")
    ## Iteration loop over each paper (PDF) of the current year
    for link in paper_links:
        paper_title = link.contents[0]
        info_link = base_url + link["href"]
        pdf_link = info_link + ".pdf"
        pdf_name = link["href"][7:] + ".pdf"
        paper_id = re.findall(r"^(\d+)-", pdf_name)[0]
        pdf = requests.get(pdf_link)
        folder_name = '%i' % index_conf_year
        ## Saving the paper PDF into a per-year folder (created if it does not exist)
        pdf_dir = os.path.join("output", "pdfs", folder_name)
        if not os.path.exists(pdf_dir):
            os.makedirs(pdf_dir)
        pdf_path = os.path.join(pdf_dir, pdf_name)
        pdf_file = open(pdf_path, "wb")
        pdf_file.write(pdf.content)
        pdf_file.close()
        ## Parsing the paper's detail page with BeautifulSoup
        paper_soup = BeautifulSoup(requests.get(info_link).content, "lxml")
        ## Getting the abstract
        abstract = paper_soup.find('p', attrs={"class": "abstract"}).contents[0]
        ## Getting the (author id, author name) pairs
        authors = [(re.findall(r"-(\d+)$", author.contents[0]["href"])[0],
                    author.contents[0].contents[0])
                   for author in paper_soup.find_all('li', attrs={"class": "author"})]
        ## Getting the event type
        event_types = [h.contents[0][23:] for h in paper_soup.find_all('h3') if h.contents[0][:22] == "Conference Event Type:"]
        ## The event type is sometimes missing, so default to 'Poster'
        if not event_types:
            event_types.append("Poster")
        if len(event_types) != 1:
            print([h.contents for h in paper_soup.find_all('h3')])
            raise Exception("Bad Event Data")
        event_type = event_types[0]
        paper_text = text_from_pdf(pdf_path, temp_path)
        print(paper_title)
        ## Extracting email addresses from the PDF text
        emails = []
        # Regular expression for plain email addresses
        emails = re.findall(r'[\w\.-]+@[\w\.-]+', paper_text)
        if not emails:
            ## Fallback: expand grouped addresses of the form {name1,name2}@domain
            rx = r'\{(?P<individual>[^{}]+)\}@(?P<domain>\S+)'
            for match in re.finditer(rx, paper_text):
                email_author = [x.strip() for x in match.group('individual').split(',')]
                for email in email_author:
                    tempEmail = email + '@' + match.group('domain')
                    emails.append(tempEmail)
        ## Appending rows to the nips_authors and paper_authors lists
        for j, author in enumerate(authors):
            try:
                nips_authors.append([authors[j][0], paper_id, authors[j][1], emails[j], index_conf_year])
                print nips_authors[-1]
                paper_authors.append([len(paper_authors) + 1, paper_id, author[0]])
            except IndexError:
                ## Skip authors for whom no email address was extracted
                pass
        ## Appending a row to the papers list
        papers.append([paper_id, paper_title, event_type, pdf_name, abstract, paper_text])
    ## Decrementing the year before moving to the next index page
    index_conf_year -= 1
    ## Appending the collected rows to the CSV files (mode='a')
    pd.DataFrame(nips_authors, columns=["Id", "PaperId", "Name", "Email", "Year"]).to_csv("output/Authors.csv", mode='a', header=False, index=False)
    pd.DataFrame(papers, columns=["Id", "Title", "EventType", "PdfName", "Abstract", "PaperText"]).to_csv("output/Papers.csv", mode='a', header=False, index=False)
    pd.DataFrame(paper_authors, columns=["Id", "PaperId", "AuthorId"]).to_csv("output/PaperAuthors.csv", mode='a', header=False, index=False)
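In [ ]:
## Quick read-back of the generated CSVs (a sketch, not in the original notebook).
## Column names are re-supplied here because the files above were written with header=False.
authors_df = pd.read_csv("output/Authors.csv", header=None,
                         names=["Id", "PaperId", "Name", "Email", "Year"])
papers_df = pd.read_csv("output/Papers.csv", header=None,
                        names=["Id", "Title", "EventType", "PdfName", "Abstract", "PaperText"])
print("%d author rows, %d paper rows" % (len(authors_df), len(papers_df)))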
In [8]:
## Counting each year's total papers
index_conf_year = 2015
for i, url in enumerate(index_url_array):
    nips_papers = []
    index_url = index_url_array[i]
    r = requests.get(index_url)
    soup = BeautifulSoup(r.content, "lxml")
    paper_links = [link for link in soup.find_all('a') if link["href"][:7] == "/paper/"]
    print("%d Papers Found" % len(paper_links))
    nips_papers.append([index_conf_year, len(paper_links)])
    index_conf_year -= 1
    pd.DataFrame(nips_papers, columns=["Year", "Total_Paper"]).to_csv("output/Years.csv", mode='a', header=False, index=False)
In [1]:
## Rebuilding the URL array with only the 2015 (NIPS 28) proceedings page
base_url = "http://papers.nips.cc"
index_conf_year = 2015
index_conf_number = 28
index_url_array = []
for i in range(1):
    a = base_url + '/book/advances-in-neural-information-processing-systems-%i-%i' % (index_conf_number, index_conf_year)
    index_url_array.append(a)
    index_conf_year -= 1
    index_conf_number -= 1
print index_url_array
In [9]:
## Reusing a trimmed version of the code above to save just one year's paper abstracts
index_conf_year = 2015
for i, url in enumerate(index_url_array):
    index_url = index_url_array[i]
    r = requests.get(index_url)
    ## BeautifulSoup parses the HTML index page into a navigable tree for convenient element lookup
    soup = BeautifulSoup(r.content, "lxml")
    paper_links = [link for link in soup.find_all('a') if link["href"][:7] == "/paper/"]
    ## Total papers found for this year
    print("%d Papers Found" % len(paper_links))
    ## Initializing the per-year lists
    nips_authors = list()
    papers = list()
    paper_authors = list()
    authors_all = list()
    email_author = list()
    email_id = list()
    emails = []
    tempEmail = []
    event_type = []
    temp_path = os.path.join("output", "temp.txt")
    ## Iteration loop over each paper of the current year
    for link in paper_links:
        paper_title = link.contents[0]
        info_link = base_url + link["href"]
        pdf_link = info_link + ".pdf"
        pdf_name = link["href"][7:] + ".pdf"
        paper_id = re.findall(r"^(\d+)-", pdf_name)[0]
        pdf = requests.get(pdf_link)
        folder_name = '%i' % index_conf_year
        ## Parsing the paper's detail page with BeautifulSoup
        paper_soup = BeautifulSoup(requests.get(info_link).content, "lxml")
        ## Getting the abstract
        abstract = paper_soup.find('p', attrs={"class": "abstract"}).contents[0]
        ## Getting the (author id, author name) pairs
        authors = [(re.findall(r"-(\d+)$", author.contents[0]["href"])[0],
                    author.contents[0].contents[0])
                   for author in paper_soup.find_all('li', attrs={"class": "author"})]
        print(paper_title)
        papers.append([paper_id, index_conf_year, paper_title, pdf_name, abstract])
    index_conf_year -= 1
    ## Appending the collected rows to the CSV file (mode='a')
    pd.DataFrame(papers, columns=["Id", "Year", "Title", "PdfName", "Abstract"]).to_csv("output/Papers_abstract.csv", mode='a', header=False, index=False)
In [ ]: