This is a quick demo of what sort of data I'm pulling out with Hathi-Specific features of the Bookworm-MARC library.

First just some basic imports, including from this library.


In [ ]:
import pymarc
import random
import json
from bookwormMARC.bookwormMARC import parse_record
from bookwormMARC.hathi_methods import hathi_record_yielder
from bookwormMARC.bookwormMARC import LCCallNumber

import bookwormMARC
import sys
import os
from collections import defaultdict
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [ ]:
all_files = hathi_record_yielder(["/Users/bschmidt/Downloads/bb3.xml"])

In [ ]:
rec = all_files.next()
rec.hathi_bookworm_dicts().next()


Out[ ]:
{'cataloging_source': u' ',
 'cntry': u'enk',
 'contributing_library': 'Unknown',
 'date': 1883,
 'filename': u'coo1.ark:/13960/t5q81vr3g',
 'first_author_birth': 1816,
 'first_author_death': 1909,
 'first_author_name': u'Martin, Theodore, Sir, 1816-1909.',
 'first_place': u'London :',
 'first_publisher': u'J. Murray,',
 'government_document': u' ',
 'item_date': 1883,
 'language': u'eng',
 'lc0': 'D',
 'lc1': 'DA',
 'lc2': '536.',
 'lc_class_from_lc': True,
 'literary_form': 'Not fiction',
 'marc_record_created': u'1982-12-10',
 'permalink': u'https://babel.hathitrust.org/cgi/pt?id=coo1.ark:/13960/t5q81vr3g',
 'record_date': 1883,
 'resource_type': 'book',
 'rights_changed_date': u'2015-08-20',
 'scanner': u'cornell-ms',
 'searchstring': u'<a href=https://babel.hathitrust.org/cgi/pt?id=coo1.ark:/13960/t5q81vr3g><em>A life of Lord Lyndhurst from letters and papers in possession of his family.</em> (1883)',
 'serial_killer_guess': 'serial',
 'target_audience': 'Unknown or not specified',
 'title': u'A life of Lord Lyndhurst from letters and papers in possession of his family.'}

In [ ]:
rec.hathi_bookworm_dicts().next()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-ff8466ca3247> in <module>()
----> 1 rec.hathi_bookworm_dicts().next()

NameError: name 'rec' is not defined

Example output

Here is an example of the output of this script on Hathi books: 5 randomly selected records from the first 50000 or so in the DPLA dump. This is usually, note, more than 5 items: Hathi groups multiple items into a single record.

Note that we're using a custom superset of the pymarc.Record class called BRecord. This adds a number of functions that make it easier--for instance--to pull out a dictionary with the categories that may be useful for Bookworms in a variety of ways.

Each of the keys here is something that might make sense to chart or analyze. We want to know the scanner so that we can see if there are OCR effects or something that might be relevant. We want the library so we can see how shifting library composition affects time series. It might make sense to build up miniature bookworms for particular authors, or publishers, etc.

To start with, I just print four random records from the first 500 or so.


In [ ]:
from bookwormMARC.bookwormMARC import BRecord

n=0
for rec in all_files:
    if random.random()>.5:
        print "."
        continue
    for entry in rec.hathi_bookworm_dicts():
        # Pretty print the dictionary entry.
        print json.dumps(entry,sort_keys=True, indent=2, separators=(',', ': ') )
        print ""
    n+=1
    if n>4:
        break


{
  "cataloging_source": " ",
  "cntry": "mau",
  "contributing_library": "Cornell University",
  "date": 1900,
  "filename": "coo1.ark:/13960/t9s18j800",
  "first_author_birth": 1862,
  "first_author_death": 1930,
  "first_author_name": "Stratemeyer, Edward, 1862-1930.",
  "first_place": "Boston,",
  "first_publisher": "Lee and Shepard,",
  "government_document": " ",
  "item_date": 1900,
  "language": "eng",
  "lc0": "P",
  "lc1": "PZ",
  "lc2": "7.",
  "lc_class_from_lc": true,
  "literary_form": "Fiction",
  "marc_record_created": "1972-07-24",
  "permalink": "https://babel.hathitrust.org/cgi/pt?id=coo1.ark:/13960/t9s18j800",
  "record_date": 1900,
  "resource_type": "book",
  "rights_changed_date": "2015-07-19",
  "scanner": "cornell-ms",
  "searchstring": "<a href=https://babel.hathitrust.org/cgi/pt?id=coo1.ark:/13960/t9s18j800><em>The campaign of the jungle; or, Under Lawton through Luzon /</em> (1900)",
  "serial_killer_guess": "book",
  "subject_places": [
    "a-ph---"
  ],
  "target_audience": "Unknown or not specified",
  "title": "The campaign of the jungle; or, Under Lawton through Luzon /"
}

{
  "cataloging_source": " ",
  "cntry": "mau",
  "contributing_library": "Cornell University",
  "date": 1910,
  "filename": "coo1.ark:/13960/t8qc0kj10",
  "first_author_birth": 1803,
  "first_author_death": 1882,
  "first_author_name": "Emerson, Ralph Waldo, 1803-1882.",
  "first_place": "Boston ;",
  "first_publisher": "Houghton Mifflin Company,",
  "government_document": " ",
  "item_date": 1910,
  "language": "eng",
  "lc0": "P",
  "lc1": "PS",
  "lc2": "1633",
  "lc_class_from_lc": true,
  "literary_form": "Letters",
  "marc_record_created": "1972-08-09",
  "permalink": "https://babel.hathitrust.org/cgi/pt?id=coo1.ark:/13960/t8qc0kj10",
  "record_date": 1910,
  "resource_type": "book",
  "rights_changed_date": "2015-07-21",
  "scanner": "cornell-ms",
  "searchstring": "<a href=https://babel.hathitrust.org/cgi/pt?id=coo1.ark:/13960/t8qc0kj10><em>Records of a lifelong friendship, 1807-1882 : Ralph Waldo Emerson and William Henry Furness /</em> (1910)",
  "serial_killer_guess": "book",
  "target_audience": "Unknown or not specified",
  "title": "Records of a lifelong friendship, 1807-1882 : Ralph Waldo Emerson and William Henry Furness /"
}

{
  "cataloging_source": "u",
  "cntry": "nyu",
  "contributing_library": "isrlf",
  "date": 1807,
  "filename": "uc2.ark:/13960/t1zc7t220",
  "first_author_birth": 1742,
  "first_author_death": 1823,
  "first_author_name": "Combe, William, 1742-1823.",
  "first_place": "Troy, N.Y. :",
  "first_publisher": "Printed and sold by Wright, Goodenow, & Stockwell,",
  "government_document": " ",
  "item_date": 1807,
  "language": "eng",
  "literary_form": "Not fiction",
  "marc_record_created": "1972-08-10",
  "permalink": "https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/t1zc7t220",
  "record_date": 1807,
  "resource_type": "book",
  "rights_changed_date": "2013-08-10",
  "scanner": "ia",
  "searchstring": "<a href=https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/t1zc7t220><em>Letters of the late Lord Lyttleton; to which is now added, a memoir concerning the author, including an account of some extraordinary circumstances attending his death.</em> (1807)",
  "serial_killer_guess": "book",
  "target_audience": "Unknown or not specified",
  "title": "Letters of the late Lord Lyttleton; to which is now added, a memoir concerning the author, including an account of some extraordinary circumstances attending his death."
}

.
{
  "cataloging_source": " ",
  "cntry": "enk",
  "contributing_library": "Cornell University",
  "date": 1883,
  "filename": "coo1.ark:/13960/t5q81vr3g",
  "first_author_birth": 1816,
  "first_author_death": 1909,
  "first_author_name": "Martin, Theodore, Sir, 1816-1909.",
  "first_place": "London :",
  "first_publisher": "J. Murray,",
  "government_document": " ",
  "item_date": 1883,
  "language": "eng",
  "lc0": "D",
  "lc1": "DA",
  "lc2": "536.",
  "lc_class_from_lc": true,
  "literary_form": "Not fiction",
  "marc_record_created": "1982-12-10",
  "permalink": "https://babel.hathitrust.org/cgi/pt?id=coo1.ark:/13960/t5q81vr3g",
  "record_date": 1883,
  "resource_type": "book",
  "rights_changed_date": "2015-08-20",
  "scanner": "cornell-ms",
  "searchstring": "<a href=https://babel.hathitrust.org/cgi/pt?id=coo1.ark:/13960/t5q81vr3g><em>A life of Lord Lyndhurst from letters and papers in possession of his family.</em> (1883)",
  "serial_killer_guess": "serial",
  "target_audience": "Unknown or not specified",
  "title": "A life of Lord Lyndhurst from letters and papers in possession of his family."
}

{
  "cataloging_source": "d",
  "cntry": "enk",
  "contributing_library": "inrlf",
  "date": 1920,
  "filename": "uc2.ark:/13960/t1sf2n69b",
  "first_author_name": "MacCarthy, Edward Thomas.",
  "first_place": "London,",
  "first_publisher": "Routledge,",
  "government_document": " ",
  "item_date": 1920,
  "language": "eng",
  "literary_form": "Not fiction",
  "marc_record_created": "1985-04-27",
  "permalink": "https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/t1sf2n69b",
  "record_date": 1920,
  "resource_type": "book",
  "rights_changed_date": "2015-08-05",
  "scanner": "ia",
  "searchstring": "<a href=https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/t1sf2n69b><em>Further incidents in the life of a mining engineer,</em> (1920)",
  "serial_killer_guess": "book",
  "target_audience": "Unknown or not specified",
  "title": "Further incidents in the life of a mining engineer,"
}

Experimental: testing the goodness of record 043 codes.


In [ ]:
if False:
    all_files = hathi_record_yielder()
    knowledge = open("/drobo/knowledge_directions.tsv","w")

    for record in all_files:
        if record['043'] is not None:
            try:
                dicto = record.bookworm_dict()
                subjects = dicto['subject_places']
                p1 = record.first_place()
                cntry = dicto['cntry']
                year = dicto['date']
                for subject in subjects:
                    knowledge.write("\t".join(map(str,[subject,p1,year,cntry,record['001'].value(),dicto['title'].encode("utf-8")]))+ "\n")
            except:
                pass

The world of available fields

This code creates a list of fields that appear in more than 10% of a randomly selected subset of records. They include the control fields; author and title information; and some more esoteric things including country of study.


In [ ]:
from collections import defaultdict
n = 0
global_counts = defaultdict(int)

for record in all_files:
    if random.random() >.2:
        continue
    already_seen = set([])
    n+=1
    from collections import defaultdict
    for dicto in record.as_dict()['fields']:
        name = dicto.keys()[0]
        if 'subfields' in dicto[name]:
            for subfield in dicto[name]['subfields']:
                tupo = (name,subfield.keys()[0])
        else:
            tupo = (name,None)
        if not tupo in already_seen:
            global_counts[tupo] +=1
            already_seen.add(tupo)
    if n > 10000:
        break

In [ ]:
a = [((k,v),count) for ((k,v),count) in global_counts.iteritems()]
a.sort()
for elem in a:
    if elem[1] > 1000:
        print elem

Better years

One of the big things I've noticed is that the 974 field has better year information than the record information, such as individual fields.

The following block shows that something like 1 in 3 items, in about one in ten records, have a different entry in the 974y field from the native date field. That suggests huge possibilities for improving dates if we're not already using the 974y fields: I suspect we are not based on the serial volumes that include 974y fields I see in the online browser.


In [ ]:
all_files = hathi_record_yielder()

import collections

records = 0
diff_records = 0
items = 0
diff_items = 0
date_diffs = collections.defaultdict(int)
for rec in all_files:
    if random.random() > .1:
        # Print just one in one hundred files each time for debugging
        continue
    records += 1
    line_counted = False
    for dicto in rec.hathi_bookworm_dicts():
        try:
            if dicto["item_date"] != dicto["record_date"]:
                date_diffs[(dicto["item_date"],dicto["record_date"])] += 1
        except KeyError:
            pass
    if records>1000:
        break
print "%i out of %i records and %i out of %i items have differing dates" %(diff_records,records,diff_items,items)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-8-dc250696dd2b> in <module>()
----> 1 all_files = hathi_record_yielder()
      2 
      3 import collections
      4 
      5 records = 0

TypeError: hathi_record_yielder() takes at least 1 argument (0 given)

In [ ]:

Assessing differences in dates between 974 and the main MARC record

]The most common pattern is that I'm replacing a "None" value with an actual year, or vice versa. It would be wise to see if there isn't sometimes a better solution than the Nones for the original fields. (Eg; am I overrelying on F008?)


In [ ]:
flattened = sorted([(-val,f008,f974,val) for ((f974,f008),val) in date_diffs.iteritems()])
flattened[:20]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-4c779291654c> in <module>()
----> 1 flattened = sorted([(-val,f008,f974,val) for ((f974,f008),val) in date_diffs.iteritems()])
      2 flattened[:20]

NameError: name 'date_diffs' is not defined

Looking at only places where we have years, most are the realm of reasonableness here. (With just 1000 examples, I'm certainly getting a lot of repeat entries.)

There are, though, a number of places where f974 instates an earlier entry than does the native date field.


In [ ]:
flattened = sorted([(-val,f008,f974,val) for ((f974,f008),val) in date_diffs.iteritems() 
                    if f974 is not None and f008 is not None])
flattened[:20]

Let's look to see what those are. Here are thirty.


In [ ]:
records = 0
for rec in all_files:
    if random.random() > .5:
        # Print just one in two for debugging
        continue
    rec.__class__ = BRecord
    for field in rec.get_fields('974'):
        items += 1
        if str(field['y']) != str(rec.date()) and field['y'] is not None and rec.date() is not None:
            if int(field['y']) < int(rec.date()):
                records += 1
                print rec
    if records>30:
        break

The basic problem in all of these seems to be that in the original record, field 260c and field 008 disagree on the date. Pymarc prefers 260 in these cases; Zephir prefers field 008. Fair enough.

Cataloging Errors: What country is Vienna in?

I've noticed that cataloging librarians often just dump the city "Wien" into Germany because, I guess, they look only at the language. How often does this happen? Here's some code to check, 1000 entries at a time.


In [ ]:
n = 0
right = 0
wrong = 0
all_files = hathi_record_yielder()

In [ ]:
n = 0

for rec in all_files:
    n += 1
    if n>100000:
        break
    dicto =  rec.bookworm_dict()
    try:
        if "Wien" in dicto['first_place'] and not "und Wien" in dicto['first_place']:
            if dicto['cntry'] != 'au ' and dicto['cntry'] != 'au#' and dicto['cntry'] != "xx ":
                wrong += 1
                #a = bookwormMARC.bookwormMARC.F008(rec)
                print ("oops", dicto['cntry'],dicto['first_place'])
            else:
                #print dicto['cntry']
                right += 1
    except KeyError:
        continue
try:
    print "So far, {} wrong and {} right ({:0.2f}%)".format(wrong,right,100*float(wrong)/float(wrong+right))
except ZeroDivisionError:
    print "Haven't found any yet: try running again"


('oops', u'gw ', u'Wien,')
So far, 7 wrong and 85 right (7.61%)

In [ ]:


In [ ]:
n = 0

dictee = defaultdict(int)

for rec in all_files:
    n+=1
    if n > 10:
        break
        
    for item in rec.hathi_bookworm_dicts():
        if item['serial_killer_guess'] != item['resource_type']:
            print item['title']
        try:
            dictee[(item['serial_killer_guess'],item['resource_type'])] += 1
        except KeyError:
            pass
        break
for (k,v) in dictee.iteritems():
    print (k,v)

In [ ]:
bar


Out[ ]:
{'cataloging_source': u' ',
 'cntry': u'nyu',
 'contributing_library': u'nrlf',
 'date': 1949,
 'filename': u'uc1.$b243514',
 'first_author_birth': 1902,
 'first_author_name': u'Meeker, Arthur, 1902-',
 'first_place': u'New York,',
 'first_publisher': u'A. A. Knopf,',
 'government_document': u' ',
 'item_date': 1949,
 'language': u'eng',
 'lc0': 'P',
 'lc1': 'PZ',
 'lc2': '3.',
 'lc_class_from_lc': True,
 'literary_form': 'Fiction',
 'marc_record_created': u'1985-04-08',
 'permalink': u'https://babel.hathitrust.org/cgi/pt?id=uc1.$b243514',
 'record_date': 1949,
 'resource_type': 'book',
 'rights_changed_date': u'2013-10-15',
 'scanner': u'google',
 'searchstring': u'<a href=https://babel.hathitrust.org/cgi/pt?id=uc1.$b243514><em>Prairie Avenue.</em> (1949)',
 'serial_killer_guess': 'book',
 'target_audience': 'Unknown or not specified',
 'title': u'Prairie Avenue.'}

In [ ]:
import re
testrow = "CLASS C - AUXILIARY SCIENCES OF HISTORY"
file = open("/drobo/hathi_metadata/vocabularies/lc_class.txt","r")
line = line.rstrip("\n")
line = line.rstrip("\r")

In [ ]:
def parse_row(string):
    """
    Each row is 4-tuple; the level,
    lower bound,
    the upper bound, and the label.
    """
    l1 = re.search(r"CLASS ([A-Z]) - (\w+)",string)
    if l1:
        groups = l1.groups()
        return (0,groups[0],groups[0],groups[1])
    l2 = re.search(r"Subclass ([A-Z]{1,3})",string)
    if l2:
        groups = l2.groups()
        return (1,groups[0],groups[0],None)
    l3 = re.search(r'([A-Z]+)\(?(\d+(?:\.[A-Z]?\d*)?)\)?-?\(?(\d+(?:\.[A-Z]?\d*)?)?\)?(\t+)(.*)',string)
    if l3:
        groups = list(l3.groups())
        if groups[2] is None:
            groups[2] = groups[1]
        return (1 + len(groups[3]),groups[1],groups[2],groups[4])

line = file.readline()
if line =="\n": line = file.readline()

print line
print parse_row(line)

In [ ]:
for i in xrange(20):
    foo = all_files.next()
    print foo['008'].data[32] + '-' + foo['008'].data[33]

In [ ]:
i = 0
output = open("/drobo/jsoncatalog.txt","w")
all_files = hathi_record_yielder()
for record in hathi_record_yielder():
    for item in record.hathi_bookworm_dicts():
        output.write(json.dumps(item) + "\n")
        i += 1
        if i % 100000 == 0:
            print "{} complete".format(i)


100000 complete
200000 complete
300000 complete
400000 complete
500000 complete
600000 complete
700000 complete
800000 complete
900000 complete
1000000 complete
1100000 complete
1200000 complete
1300000 complete
1400000 complete
1500000 complete
1600000 complete
1700000 complete
1800000 complete
1900000 complete
2000000 complete
2100000 complete
2200000 complete
2300000 complete
2400000 complete
2500000 complete
2600000 complete
2700000 complete
2800000 complete
2900000 complete
3000000 complete
3100000 complete
3200000 complete
3300000 complete
3400000 complete
3500000 complete
3600000 complete
3700000 complete
3800000 complete
3900000 complete
4000000 complete
4100000 complete
4200000 complete
4300000 complete
4400000 complete
4500000 complete
4600000 complete
4700000 complete
4800000 complete
4900000 complete
5000000 complete
5100000 complete
5200000 complete
5300000 complete
5400000 complete
5500000 complete

In [ ]:



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-ea91ade2ae20> in <module>()
----> 1 rec['082']

NameError: name 'rec' is not defined