This notebook downloads the files referenced in JSON files generated by the Econbiz API, tries to determine the corresponding RePEc handle for each document, obtains citation counts via the CitEc API, and stores that information.
A directory called data with the subdirectories pdf, json, and failed will be created (one level above the working directory; see the wd variable in the main cell at the end of this notebook).
pdf contains the PDF files.
json contains the corresponding metadata.
failed keeps track of files that couldn't be downloaded.
The metadata of a file named pdf/foobar.pdf can be found in json/foobar.pdf.json.
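For illustration, a minimal sketch of that naming convention (jsonPathFor is a hypothetical helper used only for this example, not part of the notebook's code):

import os

def jsonPathFor(pdfPath, dataDir='data'):
    # hypothetical helper: maps a downloaded PDF to the JSON file
    # that holds its metadata and citation counts
    return os.path.join(dataDir, 'json', os.path.basename(pdfPath) + '.json')

print jsonPathFor('data/pdf/foobar.pdf')  # -> data/json/foobar.pdf.json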
The code in this notebook uses multiple APIs and places a considerable load on those services. As a consequence, you (more precisely: your IP address) could be blacklisted, which would temporarily prevent you from using a service. To mitigate this, the program creates cache files whenever possible.
The first run of this program will take a long time (depending on your machine and your internet connection, but we are talking about hours), so be patient. Subsequent runs are much faster (less than a minute) because the local caches are used.
In [ ]:
import urllib2
import json
import os
import re
import xmltodict
from urllib2 import URLError
maxNumDocs = 200000
def readData(path='repec.json'):
'''
helper function that reads json data and
converts it to python objects
'''
with open(path) as f:
return json.load(f)
def apiToJson(url, toFile=True, cacheFile='repec.json'):
'''
Queries `url` and stores the result to `repec.json`. By overriding
the `cacheFile` parameter the result will be written into another
file. If `toFile` is set to false, the function will return the
object instead of persisting it
'''
eBData = urllib2.urlopen(url)
eBData = json.loads(eBData.read())
if toFile and (type(cacheFile) == str or type(cacheFile) == unicode) and len(cacheFile) > 0:
with open(cacheFile, 'w') as f:
json.dump(eBData, f)
    elif toFile == False:
        return eBData
    else:
        raise ValueError('If `toFile` is set to True you need to pass a valid path in the `cacheFile` parameter')
def citationCount(repecHdl):
'''
Return citation counts from RePec's citec API
'''
# do we have a valid repec-handle?
if type(repecHdl) == str or type(repecHdl) == unicode:
citecUrl = 'http://citec.repec.org/api/plain/' + repecHdl + '/us435'
try:
citationData = xmltodict.parse(urllib2.urlopen(citecUrl, timeout=10).read())
        except Exception:
            raise URLError('Couldn\'t fetch data. Check your configuration and ' + \
                'the availability of http://citec.repec.org')
else:
if citationData.has_key('errorString'):
raise IOError(citationData['errorString'])
if citationData.has_key('citationData'):
citedBy = citationData['citationData']['citedBy']
cites = citationData['citationData']['cites']
return {'citedBy': citedBy, 'cites': cites}
else:
return {'citedBy': None, 'cites': None}
else:
raise TypeError('You need to pass a string')
def mkDir(dir):
'''
creates a dir with name `dir` if it doesn't exist
'''
if not os.path.exists(dir):
os.makedirs(dir)
def validateURL(url):
regex = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
r'localhost|' # localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|' # ...or ipv4
r'\[?[A-F0-9]*:[A-F0-9:]+\]?)' # ...or ipv6
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
return regex.match(url) != None
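The cell above only defines helpers. As a hedged usage sketch (not an original cell of this notebook): the RePEc handle below is a made-up placeholder, and the call only succeeds if the CitEc API is reachable.

# usage sketch; 'RePEc:zbw:example:12345' is a made-up placeholder handle
try:
    print citationCount('RePEc:zbw:example:12345')
except (TypeError, IOError, URLError) as e:
    print 'citation lookup failed:', e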
In [ ]:
import json
import os
import urllib2
import IPython
import ipykernel as kernel
connection_file_path = kernel.get_connection_file()
connection_file = os.path.basename(connection_file_path)
kernel_id = connection_file.split('-', 1)[1].split('.')[0]
def executeCell(x=0):
''' executes the code in cell no 'x' (zero-based indexing)
'''
sessions = json.load(urllib2.urlopen('http://127.0.0.1:8888/api/sessions'))
ipynbFileName = ""
for sess in sessions:
if sess['kernel']['id'] == kernel_id:
ipynbFileName = sess['notebook'][u'path']
ipynbFileName = ipynbFileName.split(os.sep)[-1]
break
# read this notebook's file
if ipynbFileName != "":
with open(ipynbFileName) as f:
nb = json.load(f)
# locate cell's code
if type(nb) == dict:
try:
code = ""
if nb[u'cells'][x][u'cell_type'] == u'code':
for s in nb[u'cells'][x]['source']:
code += s
else:
raise TypeError("The cell you request is not of type 'code'")
except IndexError:
raise IndexError('No cell #' + str(x))
# execute
get_ipython().run_cell(code)
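For example, executeCell(0) re-runs the first code cell of this notebook. This is only a usage sketch; it assumes the notebook server answers at http://127.0.0.1:8888 as hard-coded above.

# usage sketch: re-run the first code cell (zero-based index)
executeCell(0)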
In order to receive citation count data from RePEc for a given Econbiz document, the corresponding RePEc handle (a unique identifier) is required. Unfortunately, there is no straightforward way to obtain it. This notebook implements three ways to determine the RePEc handle; they are presented in the course of this notebook.
This method obtains the RePEc handle through two stages of indirection from a given Econbiz ID (e.g. 10011374989):
the /record method of the Econbiz API, whose identifier_number field may contain a handle.net handle
EconStor's handleToRepec mapping, which translates that handle.net handle into a RePEc handle
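To make the two stages concrete, here is a minimal sketch (that the example record actually carries a handle.net identifier, and that EconStor knows a mapping for it, are assumptions); the next cell wraps this chain in determineRepecHandle_WolfgangsMethod and adds a local cache.

import json
import re
import urllib2

ebId = '10011374989'  # the example ID from above
record = json.loads(urllib2.urlopen('http://api.econbiz.de/v1/record/' + ebId).read())
# stage 1: look for a handle.net handle in the identifier_number field
for ident in record['record']['identifier_number']:
    match = re.search(r'[0-9]{4,6}/[0-9]{3,6}', ident)
    if match is not None and 'andle' in ident:
        prefix, suffix = match.group().split('/')
        # stage 2: let EconStor translate the handle.net handle into a RePEc handle
        print urllib2.urlopen('http://www.econstor.eu/repec/handleToRepec/' + prefix + '/' + suffix + '.txt').read()
        break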
In [ ]:
import json
import urllib2
import re
from urllib2 import URLError
import xmltodict
import os
def determineRepecHandle_WolfgangsMethod():
cacheFile = 'wolfgangsCache.json'
# Build LookUpTable
if os.path.exists(cacheFile):
with open(cacheFile) as f:
lut = json.load(f)
else:
lut = {}
def fetchRepecHandler(id):
        # Pass the Econbiz ID and receive the RePEc handle (if one exists)
try:
econbizRecordURL = 'http://api.econbiz.de/v1/record/' + id
except TypeError:
raise TypeError('You need to pass the id as a str or unicode.')
try:
# fetch more details corresponding to current item
# looking for a handle.net handle
itemMetadata = urllib2.urlopen(econbizRecordURL).read().decode('utf8')
itemMetadata = json.loads(itemMetadata)
except Exception:
raise IOError("Couldn't read ressource. Not a JSON file?")
else:
hdlStrings = ""
for identifier_url in itemMetadata['record']['identifier_number']:
# is it a handle.net handle?
                if re.match(r'(hdl:)?[0-9]{4,6}/[0-9]{3,6} \[[Hh]andle\]', identifier_url) != None:
match = re.search(r'[0-9]{4,6}/[0-9]{3,6}', identifier_url)
if match != None:
hdlStrings = match.group().split('/')
# do we have a valid handle.net-handle?
if type(hdlStrings) == list:
handleToRepecUrl = 'http://www.econstor.eu/repec/handleToRepec/' + hdlStrings[0] + '/' + hdlStrings[1] + '.txt'
try:
return urllib2.urlopen(handleToRepecUrl).read()
except URLError:
return None
def lookup(id):
        # look up the local cache and return the RePEc handle if present
if lut.has_key(id):
return lut[id]
# handler not in local cache. fetch and persist it
repecHandler = fetchRepecHandler(id)
lut.update({id: repecHandler})
with open(cacheFile, 'w') as f:
json.dump(lut, f)
return repecHandler
return lookup
In [ ]:
wolfgangsMetadataFile = 'wolfgangsMetadata.json'
if not os.path.exists(wolfgangsMetadataFile):
apiToJson(url='http://api.econbiz.de/v1/search?q=source:econstor+identifier_url:pdf&secret=Z-8_uu&size=' + str(maxNumDocs) + '&fields=title,identifier_url,person,date,id', cacheFile=wolfgangsMetadataFile)
data = readData(wolfgangsMetadataFile)
hasRepec = 0
numDocs = len(data['hits']['hits'])
lookup = determineRepecHandle_WolfgangsMethod()
for i, item in enumerate(data['hits']['hits']):
try:
repecHdl = lookup(item['id'])
except:
# we don't care about any errors ;)
continue
if repecHdl != None:
hasRepec += 1
if i % 1000 == 0:
print "{:.1f}% finished".format((i/float(numDocs))*100)
print "\nRESULT:\n{:.1f}% items have a repec handle".format((hasRepec/float(numDocs))*100)
In [ ]:
import urllib2
import json
import os
henningsMetadataFile = 'henningsMetadata.json'
def determineRepecHandle_HenningsMethod():
'''
    For efficiency reasons (using a closure), this method returns
    a function that allows querying the dataset by Econbiz ID,
    instead of doing the lookup itself.
'''
cacheFile = 'henningsCache.json'
if os.path.exists(cacheFile):
with open(cacheFile) as f:
lut = json.load(f)
else:
if not os.path.exists(henningsMetadataFile):
apiToJson(url='http://api.econbiz.de/v1/search?q=source:econstor+identifier_url:pdf&secret=Z-8_uu&size=' + str(maxNumDocs) + '&fields=title,identifier_url,person,date,id,identifier_repec', cacheFile=henningsMetadataFile)
eBData = readData(henningsMetadataFile)
lut = {i['id']: i['identifier_repec'] for i in eBData['hits']['hits'] if i.has_key('identifier_repec')}
with open(cacheFile, 'w') as f:
json.dump(lut, f)
def lookup(id):
if type(id) != str and type(id) != unicode:
raise TypeError('You need to pass the id as a str or unicode.')
try:
return lut[id]
except KeyError:
return None
return lookup
In [ ]:
if not os.path.exists(henningsMetadataFile):
apiToJson(url='http://api.econbiz.de/v1/search?q=source:econstor+identifier_url:pdf&secret=Z-8_uu&size=' + str(maxNumDocs) + '&fields=title,identifier_url,person,date,id,identifier_repec', cacheFile=henningsMetadataFile)
data = readData(henningsMetadataFile)
hasRepec = 0
numDocs = len(data['hits']['hits'])
lookup = determineRepecHandle_HenningsMethod()
for i, item in enumerate(data['hits']['hits']):
try:
id = lookup(item['id'])
except:
# we don't care about any errors ;)
continue
if id != None:
hasRepec += 1
if i % 1000 == 0:
print "{:.1f}% finished".format((i/float(numDocs))*100)
print "\nRESULT:\n{:.1f}% items have a repec handle".format((hasRepec/float(numDocs))*100)
In [ ]:
data = readData('henningsMetadata.json') # picked Henning's file arbitrarily
henningsMethod = determineRepecHandle_HenningsMethod()
wolfgangsMethod = determineRepecHandle_WolfgangsMethod()
henningsSet = set()
wolfgangsSet = set()
numDocs = len(data['hits']['hits'])
for i, item in enumerate(data['hits']['hits']):
try:
eBId = item['id']
except TypeError:
continue
else:
try:
henningsId = henningsMethod(eBId)
wolfgangsId = wolfgangsMethod(eBId)
except:
continue
else:
if henningsId != None:
henningsSet.add(henningsId)
if wolfgangsId != None:
wolfgangsSet.add(wolfgangsId)
print '\nRESULT:\nWolfgang without Henning: {}\nHenning without Wolfgang: {}' \
.format(str(len(wolfgangsSet.difference(henningsSet))), str(len(henningsSet.difference(wolfgangsSet))))
print 'Henning: {}\nWolfgang: {}\nWolfgang or Henning (union): {}'.format(str(len(henningsSet)), str(len(wolfgangsSet)), str(len(wolfgangsSet.union(henningsSet))))
The third way to obtain RePEc handles is based on RePEc's search engine, IDEAS. It is obviously built for human users, but it can also be used by robots, as we do here. This method produces a RePEc handle given the title of a document. It mimics a human user who queries the search engine with the title, "clicks" the first match (if there is one) and extracts the desired information from the detail page. Note that this method is fragile and error-prone: if the layout of the website changes, the corresponding XPath expressions need to be adapted. Moreover, it can produce false positives; there is no guarantee that the returned RePEc handle is the one you were looking for. Random sampling showed, however, that the results are quite reasonable (presumably because the titles from EconBiz match those on RePEc very well, although not perfectly).
In [ ]:
import urllib
import urllib2
import json
from lxml import etree
import unicodedata
import re
def determineRepecHandle_ideasCrawler(query):
numResults = '1'
xpathFirstResult = '//*[@id="content-block"]/dl/dt/a'
def genXpathRepecDetailPage(row=1, col=1, bold=False):
xpath = u'//*[@id="biblio-body"]/table/tr[' + str(row) + ']/td[' + str(col) + ']'
if bold == True:
xpath += '/b'
return xpath
    # normalize text: replaces e.g. ä with a
    query = unicodedata.normalize("NFKD", query).encode("ascii", "ignore").decode("utf8")
    # remove everything that's not alphanumeric
    query = re.sub(r'[^A-Za-z0-9 ]*', '', query)
# percentage encoding
queryPercentageEncoded = urllib.quote_plus(query)
htmlParser = etree.HTMLParser()
# Request result list
ideasHdl = urllib2.urlopen('http://ideas.repec.org/cgi-bin/htsearch?ul=&q=' + queryPercentageEncoded + '&cmd=Search%21&wf=4BFF&s=R&dt=range&db=&de=&m=all&fmt=long&sy=1&ps=' + numResults)
# parse received page
tree = etree.parse(ideasHdl, htmlParser)
# find first match in result list
match = tree.xpath(xpathFirstResult)
# is there a match?
if len(match) > 0:
urlsDetailPages = match[0].values()
else:
return None
for url in urlsDetailPages:
if validateURL(url):
detailPageHdl = urllib2.urlopen(url)
detailsPageTree = etree.parse(detailPageHdl, htmlParser)
for i in xrange(1, 10):
'''
                Go through the table until you find 'Handle:'. In this
case return value from the same row and the next
column, which is hopefully the RePEc handle
'''
xpath = genXpathRepecDetailPage(row=i, col=1, bold=True)
matchRow = detailsPageTree.xpath(xpath)
if len(matchRow) > 0 and matchRow[0].text.strip() == 'Handle:':
xpath = genXpathRepecDetailPage(row=i, col=2)
matchCol = detailsPageTree.xpath(xpath)
if len(matchCol) > 0 and matchCol[0].text != None:
return matchCol[0].text
else:
return None
return None
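A hedged usage sketch (the title below is a made-up placeholder; as noted above, the result may be None or even a false positive):

# usage sketch; the title is a made-up placeholder
hdl = determineRepecHandle_ideasCrawler(u'A hypothetical working paper title')
print hdl  # a RePEc handle string, or None if nothing was found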
In [ ]:
def fetchRepecHandlerByEBId():
henningsMethod = determineRepecHandle_HenningsMethod()
wolfgangsMethod = determineRepecHandle_WolfgangsMethod()
def lookup(id):
# hennings method
repecHdl = henningsMethod(id)
if repecHdl != None:
return repecHdl
# wolfgangs method
repecHdl = wolfgangsMethod(id)
if repecHdl != None:
return repecHdl
return None
return lookup
Now that we have all components in place, let's glue them together. For every item in the metadata file, the code roughly does the following:
download the PDF file (unless it is already present)
look up the RePEc handle via the cached methods, falling back to the IDEAS crawler
fetch the citation counts from CitEc if a handle was found
write the item's metadata together with the citation counts to a JSON file
In [ ]:
import json
import os
import urllib2
import logging
import re
import xmltodict
import sys
import time
repecDelay = 0.5
logging.getLogger().setLevel(logging.INFO)
wd = os.getcwd() + os.sep + '..' + os.sep + 'data'
metadataFile = henningsMetadataFile
failedPath = 'failedToDownload.json'
lookupRepecHdl = fetchRepecHandlerByEBId()
with open(metadataFile, "r") as data_file:
data = json.load(data_file)
if data.has_key("hits") and data["hits"].has_key("hits"):
data = data["hits"]["hits"]
else:
raise Exception("unknown Datastructure")
# create directories if not existing
pdfDir = wd + os.sep + u'pdf'
jsonDir = wd + os.sep + u'json'
failDir = wd + os.sep + u'failed'
for f in (pdfDir, jsonDir, failDir):
mkDir(f)
u = ""
failedDownloads = []
for itemNumber, item in enumerate(data):
url = item["identifier_url"][0]
filename = url.split("/")[-1]
# download the pdf file
try:
if not os.path.exists(pdfDir + os.sep + filename):
u = urllib2.urlopen(url)
            with open(pdfDir + os.sep + filename, 'wb') as f:
f.write(u.read())
logging.log(logging.INFO, filename + " successfully downloaded.")
else:
logging.log(logging.INFO, filename + " skipped download. Already downloaded.")
except Exception as e:
logging.log(logging.INFO, url + " couldn't be opened.")
failedDownloads.append(item)
        logging.error(e)
continue
else:
# write meta data to json file
jsonFile = os.path.join(jsonDir, filename + '.json')
if os.path.exists(jsonFile):
with open(jsonFile, 'r') as f:
try:
itemFromFile = json.load(f)
except ValueError:
itemFromFile = {}
else:
itemFromFile = {}
if itemFromFile.has_key('citedBy') and \
itemFromFile.has_key('cites'):
logging.log(logging.INFO, filename + u'.json skipped. Has citations counts already')
continue
else:
citeCount = None
# try to obtain repec handle
repecHdl = lookupRepecHdl(item['id'])
if repecHdl == None:
# no handle so far. maybe we can find one on repec
title = ""
for s in item['title']:
title += s + ' '
title = title.strip()
try:
repecHdl = determineRepecHandle_ideasCrawler(title)
except Exception as e:
if e.args:
errString = "Error while crawling RePEc. Error was:\n" + unicode(e.args[0])
else:
errString = "Error while crawling RePEc."
logging.log(logging.INFO, errString)
else:
                    if repecHdl == None:
                        logging.log(logging.INFO, "Couldn't obtain a RePEc handle for " + unicode(filename))
                    else:
                        logging.log(logging.INFO, "Obtained RePEc handle from IDEAS: " + unicode(repecHdl))
if repecHdl != None:
# Fetch citation count figures
try:
citeCount = citationCount(repecHdl.strip())
# let's be kind and lower the workload
time.sleep(repecDelay)
except IOError as e:
if e.args[0].find('exceeded') != -1:
logging.log(logging.INFO, "Unfortunately you may have been blacklisted by the citec-API.")
#sys.exit('citec service unavailable')
else:
logging.log(logging.INFO, "No citation count data available for this document")
if citeCount == None:
citeCount = {'citedBy': None, 'cites': None}
itemFromFile.update(citeCount)
itemFromFile.update(item)
with open(jsonDir + os.sep + filename + u'.json', 'w') as jf:
json.dump(itemFromFile, jf)
logging.log(logging.INFO, filename + u'.json updated')
logging.log(logging.INFO, '{:.2f}% finished'.format(float(itemNumber)*100/len(data)))
if len(failedDownloads) > 0:
    with open(failDir + os.sep + failedPath, "w") as handler:
        json.dump(failedDownloads, handler)
logging.log(logging.INFO, "Downloads complete.")