In [1]:
%matplotlib inline
from __future__ import print_function
# Standard gobbledygook for handy Spark stuff
import os
from pyspark import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as sql
import pyspark.sql.types as types
#from pyspark.sql.functions import udf, length
import matplotlib.pyplot as plt
import numpy
import math
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import pyspark.ml.feature as feature

import unicodecsv
from dateutil.parser import parse
#from dateparser.date import DateDataParser


:0: FutureWarning: IPython widgets are experimental and may change in the future.

In [2]:
# Wrap the SparkContext in an SQLContext so DataFrames can be created below.
# NOTE(review): `sc` is never defined in this notebook - presumably it is the
# SparkContext injected by the pyspark kernel/shell; confirm before running.
sqlContext = SQLContext(sc)

In [3]:
# Directory containing the dump files loaded below (subject.txt, item.txt).
data_dir = "data/data-20160516"

In [4]:
#parser = DateDataParser()
def type_data_subject(l):
    """Convert one raw subject record into a typed tuple.

    Parameters
    ----------
    l : mapping
        A row as produced by the csv DictReader; must provide the keys
        "TitleID", "Subject" and "CreationDate".

    Returns
    -------
    tuple or False
        ``(int title id, subject text, parsed creation date)`` on success,
        or ``False`` when the row is missing a key or a value cannot be
        converted. ``False`` is the sentinel t_gen checks for.
    """
    try:
        return (
            int(l["TitleID"]),
            l["Subject"],
            parse(l["CreationDate"])
        )
    # Catch only ordinary errors: a bare ``except:`` would also swallow
    # KeyboardInterrupt/SystemExit and make a long load impossible to abort.
    except Exception:
        return False

def schema_subject():
    """Return the Spark schema for the tuples built by type_data_subject.

    Three nullable columns, in tuple order: titleid (integer),
    subject (string) and creationdate (date).
    """
    columns = (
        ("titleid", types.IntegerType()),
        ("subject", types.StringType()),
        ("creationdate", types.DateType()),
    )
    fields = [types.StructField(name, dtype, True) for name, dtype in columns]
    return types.StructType(fields)

# Read a file with Python's csv reader into a df - single threaded and
# inefficient, but csv reading is not guaranteed to be line-parallelizable
# and Python's parsing code is more known/hackable than Spark's
def t_gen(fn, parse_method, max_errors=50):
    """Yield typed rows from a tab-separated dump file.

    Parameters
    ----------
    fn : str
        Path of the file to read. The first line is treated as the header.
    parse_method : callable
        Called with each record (a dict keyed by header name). It must
        return the typed row, or ``False`` to flag the record as bad.
    max_errors : int, optional
        Reading stops once the number of bad records exceeds this value
        (default 50, the limit that was previously hard-coded).

    Yields
    ------
    Whatever ``parse_method`` returns for each good record.

    Notes
    -----
    Bad records are reported with ``print``. The reported number is the
    record index plus one for the header, so it matches the physical line
    only when no record spans several lines.
    """
    errors = 0
    # Binary mode: unicodecsv does the decoding itself and expects bytes.
    with open(fn, 'rb') as f:
        # encoding specified as 'utf-8-sig' since dumps have byte order mark
        f_tsv = unicodecsv.DictReader(f, encoding='utf-8-sig', dialect="excel-tab")
        # Numbering starts at 2 because the header occupies line 1.
        for i, l in enumerate(f_tsv, 2):
            row = parse_method(l)
            if row is not False:
                yield row
                continue
            errors += 1
            print("Error with {0} on line {1}".format(l, i))
            if errors > max_errors:
                print("Too many errors, stopping.")
                break

In [ ]:
# Load the subject dump into a Spark DataFrame: each record is typed by
# type_data_subject and the result is given the schema from schema_subject().
fn = os.path.join(data_dir, "subject.txt")
df = sqlContext.createDataFrame(t_gen(fn, type_data_subject), schema_subject())

In [ ]:
# Show the DataFrame's schema, then its first 10 rows as the cell output.
print(df.schema)
df.head(10)

In [5]:
def as_int(s):
    """Convert a raw text field to an int, mapping missing values to None.

    Returns None when ``s`` is None or contains only whitespace; otherwise
    returns ``int(s)``, which raises ValueError for non-numeric text.
    """
    # Test emptiness directly: the previous ``len(...) is 0`` compared object
    # identity and only worked because CPython caches small integers.
    if s is None or not s.strip():
        return None
    return int(s)

def as_date(s):
    """Parse a raw text field as a date, mapping missing values to None.

    Returns None when ``s`` is None or contains only whitespace; otherwise
    returns ``parse(s)`` (dateutil), propagating whatever it raises for
    text it cannot interpret.
    """
    # Test emptiness directly: the previous ``len(...) is 0`` compared object
    # identity and only worked because CPython caches small integers.
    if s is None or not s.strip():
        return None
    return parse(s)
    

def type_data_item(l):
    """Convert one raw item record into a typed tuple.

    The tuple holds, in order: ItemID, TitleID and ThumbnailPageID as ints
    (via as_int), nine text fields unchanged, and CreationDate as a date
    (via as_date). If any lookup or conversion fails, the exception is
    printed and ``False`` is returned instead.
    """
    int_keys = ("ItemID", "TitleID", "ThumbnailPageID")
    text_keys = (
        "BarCode",
        "MARCItemID",
        "CallNumber",
        "VolumeInfo",
        "ItemURL",
        "LocalID",
        "Year",
        "InstitutionName",
        "ZQuery",
    )
    try:
        # Fields are evaluated in the same order as the output columns.
        numbers = [as_int(l[key]) for key in int_keys]
        texts = [l[key] for key in text_keys]
        created = as_date(l["CreationDate"])
    except Exception as e:
        print(e)
        return False
    return tuple(numbers) + tuple(texts) + (created,)

def schema_item():
    """Return the Spark schema for the tuples built by type_data_item.

    All thirteen columns are nullable: three integer ids, nine string
    columns and a final creationdate date column, in tuple order.
    """
    int_columns = ("itemid", "titleid", "thumbnailpageid")
    string_columns = (
        "barcode",
        "marcitemid",
        "callnumber",
        "volumeinfo",
        "itemurl",
        "localid",
        "year",
        "institutionname",
        "zquery",
    )
    fields = [types.StructField(name, types.IntegerType(), True) for name in int_columns]
    fields += [types.StructField(name, types.StringType(), True) for name in string_columns]
    fields.append(types.StructField("creationdate", types.DateType(), True))
    return types.StructType(fields)

# Load the item dump into a Spark DataFrame. Note that this rebinds `fn` and
# `df`, so any DataFrame previously held in `df` is no longer reachable by name.
# Records rejected by type_data_item are reported in the cell output.
fn = os.path.join(data_dir, "item.txt")
df = sqlContext.createDataFrame(t_gen(fn, type_data_item), schema_item())


invalid literal for int() with base 10: '2009-05-03 00:01'
Error with {u'ItemID': u' 1049434', u'VolumeInfo': None, u'LocalID': None, u'CallNumber': None, u'BarCode': None, u'InstitutionName': None, u'TitleID': u'2009-05-03 00:01', u'ItemURL': None, u'ZQuery': None, u'ThumbnailPageID': None, u'Year': None, u'CreationDate': None, u'MARCItemID': None} on line 15548
invalid literal for int() with base 10: '1820-1848.'
Error with {u'ItemID': u' 1263520', u'VolumeInfo': None, u'LocalID': None, u'CallNumber': None, u'BarCode': u'local=1126352', u'InstitutionName': None, u'TitleID': u'1820-1848.', u'ItemURL': None, u'ZQuery': None, u'ThumbnailPageID': u'American Museum of Natural History Library', u'Year': None, u'CreationDate': None, u'MARCItemID': u'2009-05-10 00:03'} on line 20780
year is out of range
Error with {u'ItemID': u'91798', None: [u'2010-03-16 00:00'], u'VolumeInfo': u'', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'introductiontobo00stev', u'InstitutionName': u'Internet Archive', u'TitleID': u'41634', u'ItemURL': u'http://www.biodiversitylibrary.org/item/91798 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'28916380', u'Year': u'', u'CreationDate': u'4671941', u'MARCItemID': u'introductiontobo00stev'} on line 87399
year is out of range
Error with {u'ItemID': u'100634', None: [u'2010-09-26 00:02'], u'VolumeInfo': u'v.1', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'beitraegezueiner01jira', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'46547', u'ItemURL': u'http://www.biodiversitylibrary.org/item/100634 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32296197', u'Year': u'', u'CreationDate': u'6171504', u'MARCItemID': u'beitraegezueiner01jira'} on line 137525
year is out of range
Error with {u'ItemID': u'100722', None: [u'2010-09-26 00:05'], u'VolumeInfo': u'1', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'carolilinneqvi01linn', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'46560', u'ItemURL': u'http://www.biodiversitylibrary.org/item/100722 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32344859', u'Year': u'', u'CreationDate': u'2692043', u'MARCItemID': u'carolilinneqvi01linn'} on line 137610
year is out of range
Error with {u'ItemID': u'100733', None: [u'2010-09-26 00:06'], u'VolumeInfo': u'v.2', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'versucheineranle02bats', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'46561', u'ItemURL': u'http://www.biodiversitylibrary.org/item/100733 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32352053', u'Year': u'1787', u'CreationDate': u'2761124', u'MARCItemID': u'versucheineranle02bats'} on line 137621
year is out of range
Error with {u'ItemID': u'100912', None: [u'2010-10-03 00:00'], u'VolumeInfo': u'1', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'ladiesbotanyofpr01lind', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'46637', u'ItemURL': u'http://www.biodiversitylibrary.org/item/100912 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32421691', u'Year': u'', u'CreationDate': u'1967977', u'MARCItemID': u'ladiesbotanyofpr01lind'} on line 139462
year is out of range
Error with {u'ItemID': u'100913', None: [u'2010-10-03 00:00'], u'VolumeInfo': u'2', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'ladiesbotanyofpr02lind', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'46637', u'ItemURL': u'http://www.biodiversitylibrary.org/item/100913 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32422011', u'Year': u'', u'CreationDate': u'1967977', u'MARCItemID': u'ladiesbotanyofpr02lind'} on line 139463
year is out of range
Error with {u'ItemID': u'100939', None: [u'2010-10-03 00:01'], u'VolumeInfo': u'2', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'illustrationsofn02twin', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'46651', u'ItemURL': u'http://www.biodiversitylibrary.org/item/100939 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32429745', u'Year': u'1868', u'CreationDate': u'1939852', u'MARCItemID': u'illustrationsofn02twin'} on line 139488
year is out of range
Error with {u'ItemID': u'100940', None: [u'2010-10-03 00:01'], u'VolumeInfo': u'1', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'illustrationsofn01twin', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'46651', u'ItemURL': u'http://www.biodiversitylibrary.org/item/100940 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32430239', u'Year': u'1868', u'CreationDate': u'1939852', u'MARCItemID': u'illustrationsofn01twin'} on line 139489
year is out of range
Error with {u'ItemID': u'100969', None: [u'2010-10-03 00:02'], u'VolumeInfo': u'v.23 [t.1]', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'histoirenaturell01spa', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'44839', u'ItemURL': u'http://www.biodiversitylibrary.org/item/100969 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32442319', u'Year': u'1848', u'CreationDate': u'6170378', u'MARCItemID': u'histoirenaturell01spa'} on line 139516
year is out of range
Error with {u'ItemID': u'101065', None: [u'2010-10-04 12:18'], u'VolumeInfo': u'1', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'carolilinnsyst01lin', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'46735', u'ItemURL': u'http://www.biodiversitylibrary.org/item/101065 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32467516', u'Year': u'1797', u'CreationDate': u'2692044', u'MARCItemID': u'carolilinnsyst01lin'} on line 139600
year is out of range
Error with {u'ItemID': u'101105', None: [u'2010-10-04 12:19'], u'VolumeInfo': u'1', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'atlasdesplantesd01bois', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'46768', u'ItemURL': u'http://www.biodiversitylibrary.org/item/101105 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32478969', u'Year': u'1896', u'CreationDate': u'2935749', u'MARCItemID': u'atlasdesplantesd01bois'} on line 139639
year is out of range
Error with {u'ItemID': u'101132', None: [u'2010-10-04 12:22'], u'VolumeInfo': u'1', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'indexperfectusad01muel', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'46792', u'ItemURL': u'http://www.biodiversitylibrary.org/item/101132 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32486942', u'Year': u'1880', u'CreationDate': u'1903077', u'MARCItemID': u'indexperfectusad01muel'} on line 139666
year is out of range
Error with {u'ItemID': u'101594', None: [u'2010-10-31 00:00'], u'VolumeInfo': u'1', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'nouvellesdcouv01need', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'47035', u'ItemURL': u'http://www.biodiversitylibrary.org/item/101594 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32661915', u'Year': u'', u'CreationDate': u'2167572', u'MARCItemID': u'nouvellesdcouv01need'} on line 141317
year is out of range
Error with {u'ItemID': u'101644', None: [u'2010-10-31 00:02'], u'VolumeInfo': u'1', u'LocalID': u'', u'CallNumber': u'', u'BarCode': u'nomenclatorbotan01steu', u'InstitutionName': u'University Library, University of Illinois Urbana Champaign', u'TitleID': u'47064', u'ItemURL': u'http://www.biodiversitylibrary.org/item/101644 ', u'ZQuery': u'local= ', u'ThumbnailPageID': u'32680700', u'Year': u'', u'CreationDate': u'6171478', u'MARCItemID': u'nomenclatorbotan01steu'} on line 141367
invalid literal for int() with base 10: '(1767)'
Error with {u'ItemID': u' (1767)', u'VolumeInfo': u'2014-04-06 00:02', u'LocalID': None, u'CallNumber': u'', u'BarCode': u'1767', u'InstitutionName': None, u'TitleID': u'http://www.biodiversitylibrary.org/item/151919 ', u'ItemURL': None, u'ZQuery': None, u'ThumbnailPageID': u'32044105172852', u'Year': None, u'CreationDate': None, u'MARCItemID': u'Harvard University Botany Libraries'} on line 158629

In [22]:
# Peek at the first three rows of the DataFrame.
df.head(3)


Out[22]:
[Row(itemid=202851, titleid=116786, thumbnailpageid=50414006, barcode=u'CAT31301223', marcitemid=u'CAT31301223', callnumber=u'', volumeinfo=u'1918', itemurl=u'http://www.biodiversitylibrary.org/item/202851 ', localid=u'', year=u'', institutionname=u'U.S. Department of Agriculture, National Agricultural Library', zquery=u'', creationdate=datetime.date(2016, 4, 24)),
 Row(itemid=202852, titleid=116787, thumbnailpageid=50414108, barcode=u'CAT31301224', marcitemid=u'CAT31301224', callnumber=u'', volumeinfo=u'1918', itemurl=u'http://www.biodiversitylibrary.org/item/202852 ', localid=u'', year=u'', institutionname=u'U.S. Department of Agriculture, National Agricultural Library', zquery=u'', creationdate=datetime.date(2016, 4, 24)),
 Row(itemid=202853, titleid=116788, thumbnailpageid=50414118, barcode=u'CAT31301689', marcitemid=u'CAT31301689', callnumber=u'', volumeinfo=u'1918', itemurl=u'http://www.biodiversitylibrary.org/item/202853 ', localid=u'', year=u'', institutionname=u'U.S. Department of Agriculture, National Agricultural Library', zquery=u'', creationdate=datetime.date(2016, 4, 24))]

In [6]:
# Now, how to load the OCR text? 

def mk_ocr_fn(dir_name, barcode):
    """Build the path of the OCR text file for a barcode.

    Parameters
    ----------
    dir_name : str
        Directory that holds the OCR files.
    barcode : str
        Item barcode; the file is named ``<barcode>_djvu.txt``.

    Returns
    -------
    str
        ``dir_name`` joined with ``barcode`` plus the "_djvu.txt" suffix.
    """
    # Use the directory that was passed in; the argument used to be ignored
    # in favour of the global mirror_dir.
    return os.path.join(dir_name, barcode) + "_djvu.txt"

# Directory passed to mk_ocr_fn by get_ocr when locating OCR text files.
mirror_dir = "data/mirror"

def get_ocr(barcode):
    """Return the OCR text stored for ``barcode``, or None if unavailable.

    The file path comes from mk_ocr_fn(mirror_dir, barcode). This is a
    best-effort lookup: any exception raised while building the path or
    reading the file yields None rather than an error.
    """
    try:
        with open(mk_ocr_fn(mirror_dir, barcode), 'r') as handle:
            return handle.read()
    except Exception:
        return None

# Smoke test: print the OCR text for one barcode (prints None if get_ocr
# could not read the file).
print(get_ocr("CAT31293222"))


Historic, Archive Document 

Do not assume content reflects current 
scientific knowledge, policies, or practices. 



i 






In [7]:
# Add a computed column holding each item's OCR text.
# First pick the dataset to work on: swap in the commented-out sample line
# below for a smaller dataset, otherwise the full DataFrame is used.
#df2 = df.sample(withReplacement=False, fraction=0.1)
df2 = df
print(df2.count())

# Wrap get_ocr as a Spark UDF that returns a string column.
get_ocr_udf = sql.udf(get_ocr, types.StringType())

# Take the barcode column from df2, the frame being extended, so this stays
# consistent when df2 is a sample instead of df itself.
df_ocr = df2.withColumn("ocrtext", get_ocr_udf(df2["barcode"]))


181444

In [8]:
# Count the rows whose "ocrtext" column is not null.
df_ocr.filter(df_ocr["ocrtext"].isNotNull()).count()


Out[8]:
167766

In [10]:
#df_ocr_sm = df_ocr.sample(withReplacement=False, fraction=0.001)

In [9]:
# Persist the DataFrame with the OCR text column as Parquet.
df_ocr.write.parquet("data/first_draft4.parquet")

In [ ]: