In [33]:
# Python 2 only: force the interpreter-wide default string encoding to UTF-8.
# `reload` is a builtin and `sys.setdefaultencoding` exists only on Python 2;
# this cell fails on Python 3.
import sys
# Keep references to the current stdout/stderr so they can be put back after
# the reload below (presumably because reload(sys) would otherwise replace the
# notebook's output streams -- confirm against the kernel in use).
default_stdout = sys.stdout
default_stderr = sys.stderr
# Reloading sys is what makes setdefaultencoding callable again.
reload(sys)
sys.stdout = default_stdout
sys.stderr = default_stderr
# NOTE(review): this changes implicit str<->unicode conversion for the whole
# process, not only for the code in this notebook.
sys.setdefaultencoding('utf-8')
In [66]:
def get_status(div):
    """Return the text of the ``<a class="item_status">`` tag inside ``div``.

    Parameters
    ----------
    div : object exposing ``find(name, attrs=...)``
        Typically a BeautifulSoup tag for one search-result entry.

    Returns
    -------
    The tag's ``text`` value, or ``""`` when ``div`` has no such anchor.
    """
    article_status_tag = div.find('a', attrs={"class": "item_status"})
    try:
        return article_status_tag.text
    except AttributeError:
        # find() returned None (no status anchor). Only this expected failure
        # is handled; anything else (e.g. KeyboardInterrupt) now propagates
        # instead of being silently swallowed by a bare ``except``.
        return ""
def get_text(div):
    """Join every text node found under ``div`` into one space-separated string.

    Each node returned by ``div.findAll(text=True)`` is stripped of leading and
    trailing whitespace before joining; nodes that become empty still
    contribute a separator.
    """
    stripped_pieces = []
    for piece in div.findAll(text=True):
        stripped_pieces.append(piece.strip())
    return " ".join(stripped_pieces)
def clean_text(title_text):
    """Collapse all runs of whitespace in ``title_text`` to single spaces.

    Newlines, tabs and repeated spaces are each treated as one word separator,
    and leading/trailing whitespace is removed.

    The previous implementation deleted ``"\\n"`` before splitting, which fused
    words that were separated only by a line break (``"foo\\nbar"`` became
    ``"foobar"``). ``str.split()`` with no arguments already splits on
    newlines, so no separate removal step is needed.

    Parameters
    ----------
    title_text : str
        Raw text, possibly spanning several lines.

    Returns
    -------
    str
        The normalized single-line text (``""`` for blank input).
    """
    return " ".join(title_text.split())
def get_desc(div):
    """Extract the description portion of a search-result ``div``.

    The description is the text that follows the entry's status label (as
    returned by ``get_status``) and precedes the literal marker
    ``'SFXButton'``, with whitespace normalized by ``clean_text``.

    Parameters
    ----------
    div : object accepted by ``get_text`` and ``get_status``.

    Returns
    -------
    str
        The cleaned description text.
    """
    full_text = get_text(div)
    article_status = get_status(div)

    status_pos = full_text.find(article_status)
    if status_pos == -1:
        # Status label not present in the joined text: start from the
        # beginning rather than from an offset computed off -1.
        start_index = 0
    else:
        start_index = status_pos + len(article_status)

    end_index = full_text.find('SFXButton')
    if end_index == -1:
        # Marker missing: keep everything to the end. Slicing with -1 would
        # silently drop the final character.
        end_index = len(full_text)

    return clean_text(full_text[start_index:end_index])
In [80]:
def get_article_year(div, first_year=2000, last_year=2016):
    """Guess the publication year mentioned in an entry's description.

    The description from ``get_desc(div)`` is scanned for the decimal form of
    each year from ``first_year`` to ``last_year`` (inclusive), in ascending
    order, and the first year found is returned.

    Caveats: this is a plain substring test, so any four-digit run in the
    description (for example a page number) can match, and when several years
    appear the smallest one wins.

    Parameters
    ----------
    div : object accepted by ``get_desc``.
    first_year : int, optional
        First year to look for. Defaults to 2000.
    last_year : int, optional
        Last year to look for, inclusive. Defaults to 2016.

    Returns
    -------
    int
        The matched year, or ``-1`` if none of the years occurs.
    """
    # Build the description once; it does not change between iterations.
    description = get_desc(div)
    for year in range(first_year, last_year + 1):
        if str(year) in description:
            return year
    return -1
def get_article_date(div):
    """Return ``get_desc(div)`` unchanged.

    NOTE(review): despite the name, this yields the whole cleaned description,
    not a date; it is currently identical to ``get_article_desc``.
    """
    description = get_desc(div)
    return description
def get_article_id(div):
    """Return the article number shown in the ``<a class="mrnum">`` tag.

    The anchor text is trimmed, and a leading ``"MR"`` prefix is removed when
    present. Text without that prefix is returned as-is; the previous
    unconditional ``[2:]`` slice would have dropped its first two characters.

    Parameters
    ----------
    div : object exposing ``find(name, attrs=...)``
        Typically a BeautifulSoup tag for one search-result entry.

    Returns
    -------
    The identifier with surrounding whitespace and any ``"MR"`` prefix removed.

    Raises
    ------
    AttributeError
        If ``div`` contains no ``mrnum`` anchor (``find`` returns ``None``).
    """
    num_tag = div.find('a', attrs={"class": "mrnum"})
    raw_id = num_tag.text.strip()
    if raw_id.startswith("MR"):
        return raw_id[2:]
    return raw_id
def get_article_url(div):
    """Build the MathSciNet ``publdoc.html`` URL for the article in ``div``.

    The identifier from ``get_article_id(div)`` is appended to a fixed
    query-string prefix.
    """
    base_url = 'http://www.ams.org/mathscinet/search/publdoc.html?pg1=MR&s1='
    return base_url + get_article_id(div)
def get_article_desc(div):
    """Return the cleaned description text for ``div`` (delegates to ``get_desc``)."""
    description = get_desc(div)
    return description
In [81]:
def find_papers(mid):
    """Fetch a MathSciNet publications listing and summarize each entry.

    Performs a live HTTP request to ``publications.html`` with ``pg1=INDI`` and
    ``s1=<mid>``, parses the page with BeautifulSoup, and builds one record per
    ``<div class="headlineText">`` element.

    Parameters
    ----------
    mid : str
        Identifier to query (presumably a MathSciNet author id -- confirm).
        A leading ``"MR"`` prefix is removed. Previously the prefix test was
        ``"MR" in mid``, which matched anywhere in the string and then cut the
        first two characters regardless.

    Returns
    -------
    list of dict
        One dict per entry with keys ``'id'``, ``'url'``, ``'description'``
        and ``'date'``. ``'date'`` holds the value of ``get_article_year``
        (an int year, or ``-1`` when none was found).

    Raises
    ------
    Whatever ``urllib2.urlopen`` raises on network/HTTP failure, and
    ``AttributeError`` from ``get_article_id`` if an entry lacks its id anchor.
    """
    if mid.startswith("MR"):
        mid = mid[2:]
    prefix = "http://www.ams.org/mathscinet/search/publications.html?pg1=INDI&s1="
    suffix = "&vfpref=html&r=1&extend=1"
    url = prefix + mid + suffix

    # Close the connection even if read() fails; the response object was
    # previously left open.
    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    soup = BeautifulSoup.BeautifulSoup(page)
    divList = soup.findAll('div', attrs={"class": "headlineText"})

    info_list = []
    for div in divList:
        info_list.append({
            'id': get_article_id(div),
            'url': get_article_url(div),
            'description': get_article_desc(div),
            'date': get_article_year(div),
        })
    return info_list
In [82]:
import scrapy
import requests
import urllib2
import BeautifulSoup
import pickle
import pyprind
In [83]:
# Example publications-listing URL (pg1=INDI, s1=667103).
url = 'http://www.ams.org/mathscinet/search/publications.html?pg1=INDI&s1=667103'
In [ ]:
# Sample record, apparently one element copied out of a larger JSON-style
# list (this cell has never been executed).
# NOTE(review): because of the trailing comma, running this cell evaluates to
# a one-element tuple containing the dict.
# NOTE(review): the shape differs from what find_papers builds -- here "date"
# is a string, and "publication" / "collaborator_ids" are keys find_papers
# does not produce. Presumably this is the intended target schema; confirm.
{
"date":"2013",
"description":"Andersen, J. E.; Penner, R. C.; Reidys, C. M.; Waterman, M. S. Topological classification and enumeration of RNA structures by genus. J. Math. Biol. 67 (2013), no. 5, 1261\u20131278.",
"publication":"journal",
"id":"1",
"collaborator_ids":[
1,
99
]
},
In [84]:
# Try the scraper on id 153290. This issues a live HTTP request, and the
# entire returned list is echoed as the cell's output.
find_papers('153290')
In [ ]: