This is a quick demo of what sort of data I'm pulling out with Hathi-Specific features of the Bookworm-MARC library.
First just some basic imports, including from this library.
In [ ]:
import pymarc
import random
import json
from bookwormMARC.bookwormMARC import parse_record
from bookwormMARC.hathi_methods import hathi_record_yielder
from bookwormMARC.bookwormMARC import LCCallNumber
import bookwormMARC
import sys
import os
from collections import defaultdict
%load_ext autoreload
%autoreload 2
In [ ]:
# Stream pymarc records out of a local Hathi MARC-XML dump file.
all_files = hathi_record_yielder(["/Users/bschmidt/Downloads/bb3.xml"])
In [ ]:
# Pull the first record off the generator (Python 2 `.next()` syntax) and
# show the first item-level bookworm dictionary it yields.
rec = all_files.next()
rec.hathi_bookworm_dicts().next()
Out[ ]:
In [ ]:
# Each call builds a fresh generator, so this presumably re-yields the same
# first item -- NOTE(review): confirm hathi_bookworm_dicts() is stateless.
rec.hathi_bookworm_dicts().next()
Here is an example of the output of this script on Hathi books: 5 randomly selected records from the first 50000 or so in the DPLA dump. This is usually, note, more than 5 items: Hathi groups multiple items into a single record.
Note that we're using a custom subclass of the pymarc.Record class called BRecord. It extends Record with a number of methods that make it easier--for instance--to pull out a dictionary of the categories that may be useful for Bookworms in a variety of ways.
Each of the keys here is something that might make sense to chart or analyze. We want to know the scanner so that we can see if there are OCR effects or something that might be relevant. We want the library so we can see how shifting library composition affects time series. It might make sense to build up miniature bookworms for particular authors, or publishers, etc.
To start with, I just print four random records from the first 500 or so.
In [ ]:
from bookwormMARC.bookwormMARC import BRecord
n=0
for rec in all_files:
if random.random()>.5:
print "."
continue
for entry in rec.hathi_bookworm_dicts():
# Pretty print the dictionary entry.
print json.dumps(entry,sort_keys=True, indent=2, separators=(',', ': ') )
print ""
n+=1
if n>4:
break
Experimental: testing the goodness of record 043 codes.
In [ ]:
# Dead code (gated by `if False:`): for every record carrying an 043
# geographic-code field, dump tab-separated rows of
# (subject place, first place, year, country, record id, title).
if False:
    all_files = hathi_record_yielder()
    # `with` guarantees the output file is flushed and closed (the original
    # never closed it).
    with open("/drobo/knowledge_directions.tsv", "w") as knowledge:
        for record in all_files:
            if record['043'] is not None:
                try:
                    dicto = record.bookworm_dict()
                    subjects = dicto['subject_places']
                    p1 = record.first_place()
                    cntry = dicto['cntry']
                    year = dicto['date']
                    for subject in subjects:
                        knowledge.write("\t".join(map(str, [subject, p1, year, cntry, record['001'].value(), dicto['title'].encode("utf-8")])) + "\n")
                # Deliberately best-effort, but narrowed from a bare
                # `except:`, which would also swallow KeyboardInterrupt.
                except Exception:
                    pass
In [ ]:
from collections import defaultdict

# Sample ~20% of records and, over up to ~10,000 sampled records, tally how
# many records contain each (field tag, subfield code) combination.  Each
# combination is counted at most once per record via `already_seen`.
n = 0
global_counts = defaultdict(int)
for record in all_files:
    if random.random() > .2:
        continue
    already_seen = set()
    n += 1
    # (The original re-imported defaultdict inside this loop; removed.)
    for dicto in record.as_dict()['fields']:
        # Each entry is a single-key dict: {tag: field content}.
        # next(iter(...)) replaces the Python-2-only dict.keys()[0].
        name = next(iter(dicto))
        if 'subfields' in dicto[name]:
            # NOTE(review): indentation was lost in this notebook export;
            # counting every subfield (not just the last) looks like the
            # intent -- confirm against the original .ipynb.
            for subfield in dicto[name]['subfields']:
                tupo = (name, next(iter(subfield)))
                if tupo not in already_seen:
                    global_counts[tupo] += 1
                    already_seen.add(tupo)
        else:
            tupo = (name, None)
            if tupo not in already_seen:
                global_counts[tupo] += 1
                already_seen.add(tupo)
    if n > 10000:
        break
In [ ]:
a = [((k,v),count) for ((k,v),count) in global_counts.iteritems()]
a.sort()
for elem in a:
if elem[1] > 1000:
print elem
One of the big things I've noticed is that the 974 field has better year information than the record information, such as individual fields.
The following block shows that something like 1 in 3 items, in about one in ten records, have a different entry in the 974y field from the native date field. That suggests huge possibilities for improving dates if we're not already using the 974y fields: I suspect we are not based on the serial volumes that include 974y fields I see in the online browser.
In [ ]:
all_files = hathi_record_yielder()
import collections
records = 0
diff_records = 0
items = 0
diff_items = 0
date_diffs = collections.defaultdict(int)
for rec in all_files:
if random.random() > .1:
# Print just one in one hundred files each time for debugging
continue
records += 1
line_counted = False
for dicto in rec.hathi_bookworm_dicts():
try:
if dicto["item_date"] != dicto["record_date"]:
date_diffs[(dicto["item_date"],dicto["record_date"])] += 1
except KeyError:
pass
if records>1000:
break
print "%i out of %i records and %i out of %i items have differing dates" %(diff_records,records,diff_items,items)
In [ ]:
The most common pattern is that I'm replacing a "None" value with an actual year, or vice versa. It would be wise to see if there isn't sometimes a better solution than the Nones for the original fields. (E.g., am I over-relying on field 008?)
In [ ]:
# Most frequent (974 date, 008 date) disagreements first: negating the count
# makes the default ascending sort put the largest counts on top.
flattened = sorted([(-val,f008,f974,val) for ((f974,f008),val) in date_diffs.iteritems()])
flattened[:20]
Looking at only places where we have years, most are the realm of reasonableness here. (With just 1000 examples, I'm certainly getting a lot of repeat entries.)
There are, though, a number of places where f974 instates an earlier entry than does the native date field.
In [ ]:
# Same ranking as above, restricted to pairs where BOTH sources supplied a
# date (drops the None-vs-year cases).
flattened = sorted([(-val,f008,f974,val) for ((f974,f008),val) in date_diffs.iteritems()
if f974 is not None and f008 is not None])
flattened[:20]
Let's look to see what those are. Here are thirty.
In [ ]:
records = 0
for rec in all_files:
if random.random() > .5:
# Print just one in two for debugging
continue
rec.__class__ = BRecord
for field in rec.get_fields('974'):
items += 1
if str(field['y']) != str(rec.date()) and field['y'] is not None and rec.date() is not None:
if int(field['y']) < int(rec.date()):
records += 1
print rec
if records>30:
break
The basic problem in all of these seems to be that in the original record, field 260c and field 008 disagree on the date. Pymarc prefers 260 in these cases; Zephir prefers field 008. Fair enough.
In [ ]:
# Reset the counters and re-open the record stream for the next experiment.
n = 0
right = 0
wrong = 0
all_files = hathi_record_yielder()
In [ ]:
n = 0
for rec in all_files:
n += 1
if n>100000:
break
dicto = rec.bookworm_dict()
try:
if "Wien" in dicto['first_place'] and not "und Wien" in dicto['first_place']:
if dicto['cntry'] != 'au ' and dicto['cntry'] != 'au#' and dicto['cntry'] != "xx ":
wrong += 1
#a = bookwormMARC.bookwormMARC.F008(rec)
print ("oops", dicto['cntry'],dicto['first_place'])
else:
#print dicto['cntry']
right += 1
except KeyError:
continue
try:
print "So far, {} wrong and {} right ({:0.2f}%)".format(wrong,right,100*float(wrong)/float(wrong+right))
except ZeroDivisionError:
print "Haven't found any yet: try running again"
In [ ]:
In [ ]:
# Compare the heuristic serial/monograph guess against the catalogued
# resource type for the items of the first few records.
n = 0
dictee = defaultdict(int)
for rec in all_files:
n+=1
if n > 10:
break
for item in rec.hathi_bookworm_dicts():
if item['serial_killer_guess'] != item['resource_type']:
print item['title']
# NOTE(review): dictee is a defaultdict and both keys were already read two
# lines up, so this try/except KeyError can never fire -- it is redundant.
try:
dictee[(item['serial_killer_guess'],item['resource_type'])] += 1
except KeyError:
pass
# NOTE(review): indentation was lost in this notebook export; this break
# probably terminated the outer loop after the first sampled record, which
# would make the `n > 10` guard unreachable -- confirm against the .ipynb.
break
for (k,v) in dictee.iteritems():
print (k,v)
In [ ]:
# NOTE(review): stray scratch cell -- `bar` is not defined anywhere in this
# notebook and raises NameError if run.
bar
Out[ ]:
In [ ]:
import re
# A sample top-level heading row used to eyeball the regexes below.
testrow = "CLASS C - AUXILIARY SCIENCES OF HISTORY"
# NOTE(review): `file` shadows the builtin, but later cells reference this
# name, so it is kept for compatibility.
file = open("/drobo/hathi_metadata/vocabularies/lc_class.txt","r")
# Fixed: the original stripped `line` before ever assigning it (NameError).
# Read the first line, then drop the trailing newline/carriage return.
line = file.readline()
line = line.rstrip("\n")
line = line.rstrip("\r")
In [ ]:
def parse_row(string):
    """Parse one line of the LC classification outline.

    Returns a 4-tuple (level, lower bound, upper bound, label), or None
    when the line matches none of the known row shapes.
    """
    # Top-level class heading, e.g. "CLASS C - AUXILIARY SCIENCES OF HISTORY".
    # Fixed: the label pattern was (\w+), which kept only the first word of
    # the heading ("AUXILIARY" instead of the full label).
    l1 = re.search(r"CLASS ([A-Z]) - (.+)", string)
    if l1:
        groups = l1.groups()
        return (0, groups[0], groups[0], groups[1])
    # Subclass heading, e.g. "Subclass CB" -- carries no label of its own.
    l2 = re.search(r"Subclass ([A-Z]{1,3})", string)
    if l2:
        groups = l2.groups()
        return (1, groups[0], groups[0], None)
    # Numeric range row, e.g. "CB3-9\t\tHistory of civilization".  The
    # number of leading tabs before the label encodes the outline depth.
    l3 = re.search(r'([A-Z]+)\(?(\d+(?:\.[A-Z]?\d*)?)\)?-?\(?(\d+(?:\.[A-Z]?\d*)?)?\)?(\t+)(.*)', string)
    if l3:
        groups = list(l3.groups())
        if groups[2] is None:
            # A lone number stands for the degenerate range N-N.
            groups[2] = groups[1]
        return (1 + len(groups[3]), groups[1], groups[2], groups[4])
# Read the next line from the outline file (skipping one fully blank line)
# and show what parse_row makes of it.
# NOTE(review): unlike the cell above, `line` keeps its trailing newline here.
line = file.readline()
if line =="\n": line = file.readline()
print line
print parse_row(line)
In [ ]:
# Print characters 32 and 33 of the fixed-length 008 field for 20 records.
# NOTE(review): what these positions mean depends on the material type of
# the record -- confirm against the MARC 008 documentation.
for i in xrange(20):
foo = all_files.next()
print foo['008'].data[32] + '-' + foo['008'].data[33]
In [ ]:
i = 0
output = open("/drobo/jsoncatalog.txt","w")
all_files = hathi_record_yielder()
for record in hathi_record_yielder():
for item in record.hathi_bookworm_dicts():
output.write(json.dumps(item) + "\n")
i += 1
if i % 100000 == 0:
print "{} complete".format(i)
In [ ]: