In [10]:
import csv
import sys
sys.path.append("..")
from gitenberg.util.catalog import repo_for_pgid

class book(object):
    """Minimal book object that only carries an id.

    Instances are passed to ``BookMetadata`` in the pruning loop of this
    cell; the only attribute set here is ``book_id``.

    Derives from ``object`` so that it is a new-style class on Python 2
    as well as on Python 3.
    """
    def __init__(self,_id):
        # Store the id unchanged under the attribute name ``book_id``.
        self.book_id = _id
# Input files, kept in one place so they are easy to repoint.
# NOTE(review): DONE_PATH is an absolute, machine-specific path; consider
# making it relative or configurable.
DONE_PATH = '/Documents/gitenberg/done.txt'
MISSING_PATH = '../assets/missing.tsv'

# Ids already processed: the second tab-separated column of each row.
done_ids = set()
with open(DONE_PATH, 'r') as f:
    for vals in csv.reader(f, delimiter='\t', quotechar='"'):
        if len(vals) < 2:
            # Short row: show it instead of failing on vals[1].
            print(vals)
        else:
            done_ids.add(int(vals[1]))
print("{} are done".format(len(done_ids)))

# Ids listed as missing: the first tab-separated column of each row.
missing = set()
with open(MISSING_PATH, 'r') as f:
    for vals in csv.reader(f, delimiter='\t', quotechar='"'):
        if not vals:
            # csv.reader yields [] for a blank line; vals[0] would raise IndexError.
            continue
        missing.add(int(vals[0]))
print("{} are missing".format(len(missing)))

# Show repo names for a 100-key slice of the catalog that is in neither set.
# list(...) makes the slice work whether keys are a list (Python 2) or a
# view (Python 3); the iteration order is the same as .keys().
for pg_id in list(repo_for_pgid)[100:200]:
    if pg_id not in done_ids and pg_id not in missing:
        print(repo_for_pgid[pg_id])

# Drop ids from `missing` whose parsed metadata is not of type 'Text' or has
# no issue date. Iterate over a copy because the set is mutated in the loop.
# NOTE(review): BookMetadata and rdf_library are not defined in the visible
# cell (the recorded run ends in NameError for BookMetadata); both must be
# imported/defined before this loop can run.
for pg_id in list(missing):
    pg_book = book(pg_id)
    metadata = BookMetadata(pg_book, rdf_library=rdf_library, enrich=False)
    try:
        metadata.parse_rdf()
    except TypeError as e:
        # Stop at the first id that fails in parse_rdf so it can be inspected.
        print('Error in {}'.format(pg_id))
        print(e)
        break
    if metadata.gutenberg_type != u'Text':
        # discard, not remove: both conditions can hold for the same id, and
        # a second remove() would raise KeyError.
        missing.discard(pg_id)
        print('{}\t{}'.format(pg_id, metadata.gutenberg_type))
    if metadata.gutenberg_issued is None:
        missing.discard(pg_id)
        print('{}\tWithdrawn'.format(pg_id))


5700 are done
1353 are missing
Inaugural-Address-of-Franklin-Delano-Roosevelt--13-Given-in-Washington-D.C.-March-4th-1933_104
Far-from-the-Madding-Crowd_107
The-Return-of-Sherlock-Holmes_108
Renascence-and-Other-Poems_109
Freckles_111
Violists_112
United-States-Census-Figures-Back-to-1630_115
Symphony-No.-5-in-C-minor-Opus-67_117
Big-Dummy-s-Guide-to-the-Internet_118
A-Tramp-Abroad_119
The-Return-of-the-Native_122
At-the-Earth-s-Core_123
A-Girl-of-the-Limberlost_125
The-Poison-Belt_126
The-Arabian-Nights-Entertainments_128
The-Pilgrim-s-Progress-from-this-world-to-that-which-is-to-come--13-Delivered-under-the-similit__131
Maria-or-the-Wrongs-of-Woman_134
Daddy-Long-Legs_157
American-Hand-Book-of-the-Daguerreotype_167
The-1994-CIA-World-Factbook_180
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-406023ba00e1> in <module>()
     27 for pg_id in list(missing):
     28     pg_book=book(pg_id)
---> 29     metadata=BookMetadata(pg_book,rdf_library=rdf_library, enrich=False)
     30     try:
     31         metadata.parse_rdf()

NameError: name 'BookMetadata' is not defined

In [14]:
import re
# Sanity check: the anchored all-digits pattern matches '23', and group(0)
# returns the whole matched string.
re.compile(r'^\d+$').match('23').group(0)


Out[14]:
'23'

In [ ]: