Analysis suite

A program to analyse the quality of a collection (scraping) run.
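
The input (inferred from the code below) is assumed to be two JSON files: scraped5.json, a list of
successfully collected records, and losses5.json, a list of failed records along roughly these lines
(field names taken from the code; values illustrative only):

    {"error": "collection", "pub": "www.nature.com", ...}

where 'error' is one of 'collection', 'timeout' or 'missing_pub', and 'pub' (present at least on
collection and timeout failures) is the publisher hostname.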


In [59]:
import json
import re
import datetime
scraped = 'scraped5.json'
failed = 'losses5.json'
# Load the successfully scraped records and the failure log
with open(scraped, 'r') as sr, open(failed, 'r') as fr:
    s = json.load(sr)
    f = json.load(fr)

# Tally the failures by error type
coll_no = 0
time_no = 0
miss_no = 0
coll_l = []
time_l = []
miss_l = []
for rec in f:
    if rec['error'] == 'collection':
        coll_no += 1
        coll_l.append(rec)
    elif rec['error'] == 'timeout':
        time_no += 1
        time_l.append(rec)
    elif rec['error'] == 'missing_pub':
        miss_no += 1
        miss_l.append(rec)

success_no = len(s)
fail_no = len(f)
parse_no = success_no + fail_no
conv_no = float(success_no) / float(parse_no)   # conversion rate as a fraction
coll_pub = [r['pub'] for r in coll_l]           # publisher hostname behind each collection error
time_pub = [r['pub'] for r in time_l]           # publisher hostname behind each timeout

print('*'*70)
print("Analysis of scraping run performed at "+datetime.datetime.now().strftime('%m:%H %d %B %Y'))
print('*'*70+'\n')
print('Total records parsed:                          '+ str(parse_no))
print('Total records successfully collected:          ' + str(success_no))
print('Conversion rate:                               '+'%.4f'%(conv_no*100)+'%')
print('\n')
print('-'*70)
print('Losses Breakdown:')
print('-'*70+'\n')
print('Total No. of records not converted:            '+ str(fail_no))
print('No. lost due to errors in collection:          '+ str(coll_no))
print('No. lost due to request timeouts:              '+ str(time_no))
print('No. lost due to missing publisher info:        '+str(miss_no))
print('\n')
print('-'*70)
print('Collection Errors Breakdown:')
print('-'*70+'\n')
print('Most collection errors for publisher:           '+max(set(coll_pub), key=coll_pub.count))
print('\n')
for i in set(coll_pub):
    pad = max(47 - 22 - len(i), 1)   # keep at least one space before the count, even for long hostnames
    print('Collection Errors for ' + i + ' ' * pad + str(coll_pub.count(i)))
print('\n')


**********************************************************************
Analysis of scraping run performed at 11:15 30 November 2015
**********************************************************************

Total records parsed:                          155
Total records successfully collected:          17
Conversion rate:                               10.9677%


----------------------------------------------------------------------
Losses Breakdown:
----------------------------------------------------------------------

Total No. of records not converted:            138
No. lost due to errors in collection:          29
No. lost due to request timeouts:              2
No. lost due to missing publisher info:        107


----------------------------------------------------------------------
Collection Errors Breakdown:
----------------------------------------------------------------------

Most collection errors for publisher:           www.nature.com


Collection Errors for onlinelibrary.wiley.com  1
Collection Errors for www.sciencedirect.com    1
Collection Errors for rsif.royalsocietypublishing.org 1
Collection Errors for www.degruyter.com        5
Collection Errors for www.nature.com           6
Collection Errors for pubs.acs.org             6
Collection Errors for iopscience.iop.org       3
Collection Errors for link.springer.com        1
Collection Errors for pubs.rsc.org             5
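
The per-publisher tallies above are built with list.count inside a loop over set(coll_pub); a
minimal alternative sketch using collections.Counter (assuming the coll_pub list from the cell
above) would be:

In [ ]:
from collections import Counter

# Sketch only: tally collection errors per publisher in one pass,
# then report them in descending order of frequency
coll_counts = Counter(coll_pub)
worst_pub, worst_count = coll_counts.most_common(1)[0]
print('Most collection errors for publisher:           ' + worst_pub)
for pub, n in coll_counts.most_common():
    print('Collection Errors for ' + pub.ljust(32) + str(n))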

In [14]:
datetime.datetime.now().strftime('%m:%H %d %B %Y')


Out[14]:
'11:15 30 November 2015'
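
Note that '%m' is the zero-padded month and '%H' the 24-hour hour, so the '%m:%H' check above
renders month:hour (November, 15:00) rather than a clock time; the hour:minute form used in the
report header would be:

In [ ]:
# Hour:minute variant of the timestamp (sketch, not executed)
datetime.datetime.now().strftime('%H:%M %d %B %Y')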

In [ ]: