In [3]:
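# Each record below maps one publisher site to the XPaths used to pull the
# title, abstract, authors, affiliations and date, plus two Python snippets
# ('name_con', 'date_con') that are exec'd later to normalise names and dates.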
record1 = {
'publisher':'Wiley',
'pub_website':'onlinelibrary.wiley.com',
'x_title':'string(//*[@class="article-header__title"])',
'x_abstract':'string(//*[@id="abstract"]/div/p)',
'x_people':'//*[@class="article-header__authors-item"]',
'x_depts':'//*[@class="article-header__authors-item"]',
'x_person':'string(.//*[@class="article-header__authors-name"])',
'x_dept':'string(.//*[@class="article-header__authors-item-aff-addr"])',
'x_date':'string(//time[@id="first-published-date"])',
'date_con':'''
date = datetime.strptime(date,'%d %B %Y')''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    if u'Dr.' in words:
        words.remove(u'Dr.')
    for w in words[:-1]:
        if not(w==u''):
            name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record2 = {
'publisher':'Wiley',
'pub_website':'onlinelibrary.wiley.com',
'x_title':'string(//*[@class="articleTitle"])',
'x_abstract':'string(//*[@id="abstract"]/div[@class="para"])',
'x_people':'//*[@id="authors"]/li',
'x_depts':'//*[@id="authorsAffiliations"]/li',
'x_person':'text()',
'x_dept':'p/text()',
'x_date':'string(//p[@id="publishedOnlineDate"])',
'date_con':'''
date = datetime.strptime(u' '.join(date.split(' ')[-3:]),'%d %b %Y')
''',
'name_con':'''
pex2 = []
pexl = []
for p in pex:
    if type(p)==list:
        pexl += p
    else:
        pexl.append(p)
for p in pexl:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    p = re.sub(' and',u'',p)
    words = p.split(" ")
    if u'Dr.' in words:
        words.remove(u'Dr.')
    for w in words[:-1]:
        if not(w==u''):
            name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = filter(lambda x: x!=u' ',pex2)
'''
}
record3 = {
'publisher':'ACS',
'pub_website':'pubs.acs.org',
'x_title':'string(//*[@class="articleTitle"])',
'x_abstract':'string(//*[@id="abstractBox"])',
'x_people':'//*[@id="authors"]/span',
'x_person':'string(span[1])',
'x_depts':'//*[@class="affiliations"]/div',
'x_dept':'string(.)',
'x_date':'string(//*[@id="pubDate"])',
'date_con':'''
sp = date.split(' ')
date = datetime.strptime(' '.join([sp[-3]]+[sp[-2][:-1]]+[sp[-1]]),'%B %d %Y')''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record4 = { 'publisher':'American Physical Society',
'pub_website':'journals.aps.org',
'x_title':'//div[@id="title"]/descendant::h3/text()',
'x_abstract':'//meta[@name="description"]/@content',
'x_people':'//section[@class="article authors open"]/div/p',
'x_person':'*[1]/text()',
'x_depts':'//section[@class="article authors open"]/div/ul[@class="no-bullet"]',
'x_dept':'li/text()',
'x_date':'//ul[@class="inline-list pub-dates"]/li/text()',
'date_con':'''
sp = date.split(' ')
date = datetime.strptime(' '.join(sp[1:]),'%d %B %Y')''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record5 = { 'publisher':'Springer',
'pub_website':'link.springer.com',
'x_title':'//*[@class="ArticleTitle"]/text()',
'x_abstract':'string(//*[@id="Abs1"]/p)',
'x_people':'//ul[@class="AuthorNames"]/li',
'x_person':'*/span[@class="AuthorName"]/text()',
'x_depts':'//ul[@class="AuthorNames"]/li',
'x_dept':'descendant::span[@class="AuthorsName_affiliation"]/span/text()',
'x_date':'//*[@class="ArticleCitation_Year"]/time/text()',
'date_con':'''
sp = date.split(' ')
date = datetime.strptime(date,'%d %B %Y')''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record6 = { 'publisher':'Springer',
'pub_website':'link.springer.com',
'x_title':'//*[@class="ArticleTitle"]/text()',
'x_abstract':'string(//*[@id="Abs1"]/p)',
'x_people':'//ul[@class="AuthorNames"]/li',
'x_person':'*/span[@class="AuthorName"]/text()',
'x_depts':'//ul[@class="AuthorNames"]/li',
'x_dept':'descendant::span[@class="AuthorsName_affiliation"]/span/text()',
'x_date':'//*[@class="ArticleCitation_Year"]/time/text()',
'date_con':'''
sp = date.split(' ')
date = datetime.strptime(date,'%B %Y')''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record7 = { 'publisher':'ScienceDirect',
'pub_website':'www.sciencedirect.com',
'x_title':'//*[@class="svTitle"]/text()',
'x_abstract':'string(//*[@class="abstract svAbstract "])',
'x_people':'//ul[@class="authorGroup noCollab svAuthor"]/li',
'x_person':'*[@class="authorName svAuthor"]/text()',
'x_depts':'//ul[@class="affiliation authAffil"]/li',
'x_dept':'span/text()',
'x_date':'//dl[@class="articleDates"]/dd/text()',
'date_con':'''
sp = date.split(' ')
date = datetime.strptime(" ".join(sp[-3:]),'%d %B %Y')''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record8 = { 'publisher':'International Union of Crystallography',
'pub_website':'scripts.iucr.org',
'x_title':'string(//*[@class="ica_title"])',
'x_abstract':'string(//*[@class="ica_abstract"])',
'x_people':'//*[@class="ica_authors"]/a',
'x_person':'string(.)',
'x_depts':'//none-here',  # no affiliations given on page
'x_dept':'//none-here',
'x_date':'//div[@class="ica_header"]/span[2]/text()',
'date_con':'''
date = datetime.strptime(date[2:6],'%Y')''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record9 = { 'publisher':'scientific.net',
'pub_website':'www.scientific.net',
'x_title':'//div[@class="paper-title"]/div[@class="paper-name"]/text()',
'x_abstract':'//div[@class="abstract"]/p/text()',
'x_people':'//div[text()="\r\n Authors\r\n "]/following-sibling::div/a',
'x_person':'string(.)',
'x_depts':'//none-here',  # no affiliations given on page
'x_dept':'//none-here',
'x_date':'//div[text()="\r\n Online since\r\n "]/following-sibling::div/text()',
'date_con':'''
date = datetime.strptime(date.strip(),'%B %Y')''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record10 = { 'publisher':'jstage',
'pub_website':'www.jstage.jst.go.jp',
'x_title':'string(//*[contains(@class,"mod-article-heading")])',
'x_abstract':'string(//*[contains(@class,"mod-section")]/p)',
'x_people':'//*[contains(@class,"author")]/a',
'x_person':'text()',
'x_depts':'//*[contains(@class,"affiliation")]',
'x_dept':'text()',
'x_date':'string(//*[contains(@class,"date")])',
'date_con':'''
ds = re.sub("[^0-9]", "", date)
dates = [datetime.strptime(ds[i:i+8],'%Y%m%d') for i in range(0,len(ds),8)]
date = min(dates)''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record11 = { 'publisher':'rsc',
'pub_website':'pubs.rsc.org',
'x_title':'string(//div[@class="article_chemsoc_txt_s13"])',
'x_abstract':'string(//div[@class="abstract_new"])',
'x_people':'//div[@class="peptide_middle"]/div[1]/span',
'x_person':'a/text()',
'x_depts':'//div[@class="show_affiliation_section"]/div[position()>2]',
'x_dept':'div[2]/text()',
'x_date':'//div[@class="peptide_middle"]/span[last()]/text()[1]',
'date_con':'''
try:
    d = [i.strip() for i in date.strip().split(' ')]
    p = filter(lambda x: not(x==u''),d)
    date = u' '.join(p[-3:])
    date = datetime.strptime(date,'%d %b %Y')
except:
    pass''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    words = filter(lambda x: not(x==u''),words)
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record12 = { 'publisher':'iop',
'pub_website':'iopscience.iop.org',
'x_title':'string(//*[@class="wd-jnl-art-title"])',
'x_abstract':'string(//div[contains(@class,"wd-jnl-art-abstract")]/p)',
'x_people':'//span[@data-authors]/span',
'x_person':'span[@itemprop="name"]/text()',
'x_depts':'//div[@class="wd-jnl-art-author-affiliations"]/p',
'x_dept':'text()',
'x_date':'//div[contains(@class,"wd-jnl-art-dates")]/p/text()',
'date_con':'''
d = date.strip()
date = datetime.strptime(' '.join(d.split(u' ')[-3:]),'%d %B %Y')''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    words = filter(lambda x: not(x==u''),words)
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record13 = { 'publisher':'RoyalSociety',
'pub_website':'rsif.royalsocietypublishing.org',
'x_title':'//*[@id="page-title"]/text()',
'x_abstract':'//*[@id="abstract-1"]/p/text()',
'x_people':'//span[@class="highwire-citation-authors"]/span',
'x_person':'string(.)',
'x_depts':'//span[@class="nlm-aff"]',
'x_dept':'string(.)',
'x_date':'//span[contains(@class,"highwire-cite-metadata-date")]/text()',
'date_con':'''
date=datetime.strptime(' '.join(date.split(u' ')[-3:]),"%d %B %Y")''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    words = filter(lambda x: not(x==u''),words)
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record14 = { 'publisher':'RoyalSociety',
'pub_website':'rspa.royalsocietypublishing.org',
'x_title':'//*[@id="page-title"]/text()',
'x_abstract':'//*[@id="abstract-1"]/p/text()',
'x_people':'//span[@class="highwire-citation-authors"]/span',
'x_person':'string(.)',
'x_depts':'//span[@class="nlm-aff"]',
'x_dept':'string(.)',
'x_date':'//span[contains(@class,"highwire-cite-metadata-date")]/text()',
'date_con':'''
date=datetime.strptime(' '.join(date.split(u' ')[-3:]),"%d %B %Y")''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    words = filter(lambda x: not(x==u''),words)
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record15 = { 'publisher':'Scitation',
'pub_website':'scitation.aip.org',
'x_title':'string(//*[@class="title-with-crossmark"])',
'x_abstract':'string(//*[contains(@class,"abstract ")]/descendant::*[@class="articleabstract"]/p[2])',
'x_people':'//span[contains(@class,"authors")]/a',
'x_person':'text()',
'x_depts':'//div[contains(@class,"affiliations")]/a',
'x_dept':'text()',
'x_date':'//div[contains(@class,"itemCitation")]/span[3]/text()',
'date_con':'''
d = date[:-1].split(u" ")
date=datetime.strptime(u' '.join(d[2:4] + [d[-1]]),"%b %d %Y")''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    words = filter(lambda x: not(x==u''),words)
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record16 = { 'publisher':'Nature',
'pub_website':'www.nature.com',
'x_title':'//*[@class="article-heading"]/text()',
'x_abstract':'string(//*[@id="abstract"]/div/p)',
'x_people':'//*[contains(@class,"authors citation-authors")]/li',
'x_person':'a[@class="name"]/span/text()',
'x_depts':'//ol[contains(@class,"affiliations")]/li',
'x_dept':'h3/text()',
'x_date':'//*[contains(@class,"citation dates")]/dd[1]/time/text()',
'date_con':'''
date=datetime.strptime(date.strip(),"%d %B %Y")''',
'name_con':'''
pex2 = []
for p in pex:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    words = p.split(" ")
    words = filter(lambda x: not(x==u''),words)
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex2.append(name)
pex = pex2'''
}
record17 = { 'publisher':'degruyter',
'pub_website':'www.degruyter.com',
'x_title':'string(//*[@class="entryTitle"])',
'x_abstract':'string(//*[@class="articleBody_abstract"]/p)',
'x_people':'//*[@class="contributors"]/descendant-or-self::*',
'x_person':'./text()',
'x_depts':'//*[contains(@class,"NLM_affiliations")]/p',
'x_dept':'text()',
'x_date':'//*[contains(@id,"date-received")]/dd/text()',
'date_con':'''
date = datetime.strptime(date,"%Y-%m-%d")''',
'name_con':'''
pex2 = []
for p in pex:
    if type(p)==list:
        for p1 in p:
            if len(p1.strip())>3:
                pex2.append(p1.strip())
    elif p==[]:
        pass
    else:
        if len(p.strip())>3:
            pex2.append(p.strip())
pex3 = []
for p in pex2:
    name = ""
    p = re.sub(r"\s+", u" ", p, flags=re.UNICODE)
    p = re.sub(',',u'',p)
    p = re.sub('/',u'',p)
    words = p.split(" ")
    words = filter(lambda x: not(x==u''),words)
    if words[0] in [u'Dr.']:
        words.remove(words[0])
    for w in words[:-1]:
        name += w[0]
    name += u" " + words[-1]
    pex3.append(name)
pex = pex3'''
}
paths_list = [record1,record2,record3,record4,record5,record6,record7,record8,record9,record10,record11,record12,record13,record14,record15,record16,record17]
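Each record's snippets can be spot-checked by exec'ing them standalone, exactly as the scraper cell below does. A minimal sketch; the author strings and date here are made up:

import re
from datetime import datetime

pex = [u'Dr. Jane Q. Doe,', u'John  Smith']
exec record1['name_con']      # collapse each name to initials + surname
print(pex)                    # [u'JQ Doe', u'J Smith']

date = u'14 March 2015'
exec record1['date_con']      # parse with '%d %B %Y'
print(date)                   # 2015-03-14 00:00:00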
In [289]:
import requests
import scrapy
from scrapy.http import TextResponse
from datetime import datetime
import re
import pymongo
from pymongo import MongoClient
import json
import signal
import warnings
import sys
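# NB: this cell is Python 2 ('exec' statement syntax, u'' literals,
# list-returning filter()); run it under a Python 2 kernel.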
def get_paths(publisher,conn,doi):
    # TODO: build a database holding all of these records.
    # 'conn' is unused; the collection 'ca' comes from the enclosing scope.
    cur = ca.find({'pub_website':publisher})
    if cur.count()==0:
        raise Exception('no publisher website for ' + doi)
    return cur

def get_publisher(response):
    # the publisher is identified by the host part of the resolved URL
    url = response.url
    publisher = url.split('/')[2]
    return publisher

def handler(signum,frame):
    # signal handler that simply aborts whatever is running
    raise Exception()
def choose_items(items):
    # score each candidate item by how many of its fields came back
    # non-empty, and keep the highest-scoring one
    if len(items)==1:
        return items[0]
    else:
        scores = []
        for it in items:
            score = 0
            for k,v in it.items():
                if isinstance(v, basestring):  # was type(v)=='str', which never matches
                    v = v.strip()
                    if not(v==u''): score += 1
                else:
                    if not(v==[]):
                        score += 1
            scores.append(score)
        print(scores)
        return items[scores.index(max(scores))]
mongo_url = 'mongodb://localhost:27017/' # local
#mongo_url = 'mongodb://localhost:6666/' # remote
db = 'Cherry'
coll = 'CherryMunch'
client = MongoClient(mongo_url)
ca = client[db][coll]
ind = 0
dead = 0
with open('stuff.json','r') as f:
    j = json.load(f)
for rec in j[0:1]:
    # NB: the doi is hard-coded for testing; 'rec' from stuff.json is not used yet
    print(ind)
    ind += 1
    item = {}
    #doi = '10.1515/mgmc-2012-0023'
    doi = '10.1039/9781847557612'
    target_stub = 'http://dx.doi.org/'
    target = target_stub + doi
    die = True
    r = requests.get(target)
    response = TextResponse(r.url, body=r.text, encoding='utf-8')
    ### sense whose publisher this is
    pub = get_publisher(response)
    ### load the correct XPaths
    paths_list = get_paths(pub,ca,doi)
    items = []
    for paths in paths_list:
        title = response.xpath(paths['x_title']).extract()[0]
        abstract = response.xpath(paths['x_abstract']).extract()[0]
        people = response.xpath(paths['x_people'])
        depts = response.xpath(paths['x_depts'])
        pex = []
        dex = []
        for person in people:
            p = person.xpath(paths['x_person']).extract()
            if not(p==[]) and (len(p)==1):
                pex.append(p[0])
            else:
                pex.append(p)
        for dept in depts:
            d = dept.xpath(paths['x_dept']).extract()
            if not(d==[]):
                dex.append(d[0])
        exec paths['name_con']
        date = response.xpath(paths['x_date']).extract()[0]
        exec paths['date_con']
        pex = list(set(pex))
        dex = list(set(dex))
        item = {
            'title':title,
            'authors':pex,
            'depts':dex,
            'abstract':abstract,
            'date': date.strftime('%d %B %Y'),  # assumes date_con produced a datetime
            'doi':doi,
            'publisher':pub
        }
        items.append(item)
    item = choose_items(items)
    print(item)
print('proportion of records not collected: ' + str(dead))
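choose_items simply keeps whichever candidate record filled in the most fields. For illustration, with two made-up partial items:

a = {'title':u'X', 'authors':[u'J Doe'], 'depts':[], 'abstract':u''}
b = {'title':u'X', 'authors':[u'J Doe'], 'depts':[u'Chemistry'], 'abstract':u'...'}
print(choose_items([a, b]))   # prints the scores [2, 4], then item b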
In [ ]: