# In [10]:
import csv
import sys
sys.path.append("..")
from gitenberg.util.catalog import repo_for_pgid
class book(object):
    """Bare-bones record that exposes a single ``book_id`` attribute.

    Derives from ``object`` so it is a new-style class under Python 2 as
    well as Python 3.
    """

    def __init__(self, _id):
        """Store ``_id`` on the instance as ``book_id``."""
        self.book_id = _id
# Build the set of integer ids found in the second tab-separated column of
# done.txt.
done_ids = set()
with open('/Documents/gitenberg/done.txt', 'r') as done_file:
    for row in csv.reader(done_file, delimiter='\t', quotechar='"'):
        if len(row) >= 2:
            done_ids.add(int(row[1]))
        else:
            # Rows with fewer than two columns are echoed instead of parsed.
            print(row)
print("{} are done".format(len(done_ids)))
# Build the set of integer ids found in the first tab-separated column of
# missing.tsv.
missing = set()
with open('../assets/missing.tsv', 'r') as missing_file:
    rows = csv.reader(missing_file, delimiter='\t', quotechar='"')
    missing.update(int(row[0]) for row in rows)
print("{} are missing".format(len(missing)))
# Walk keys 100..199 of repo_for_pgid and print the mapped value for every id
# that appears in neither done_ids nor missing.
for pg_id in repo_for_pgid.keys()[100:200]:
    if pg_id in done_ids or pg_id in missing:
        continue
    print(repo_for_pgid[pg_id])
# Prune ``missing``: parse each id's RDF metadata and drop the id when its
# gutenberg_type is not u'Text' or when it has no gutenberg_issued value.
# NOTE(review): BookMetadata and rdf_library are not defined in the visible
# part of this file; presumably another cell or import provides them.
for pg_id in list(missing):  # iterate over a snapshot; the set is mutated below
    pg_book = book(pg_id)
    metadata = BookMetadata(pg_book, rdf_library=rdf_library, enrich=False)
    try:
        metadata.parse_rdf()
    except TypeError as e:
        # Stop at the first parse failure; pg_id and metadata stay bound to
        # the offending entry.
        print('Error in {}'.format(pg_id))
        print(e)
        break
    if metadata.gutenberg_type != u'Text':
        missing.discard(pg_id)
        print('{}\t{}'.format(pg_id, metadata.gutenberg_type))
    if metadata.gutenberg_issued is None:
        # discard() instead of remove(): the id may already have been dropped
        # by the type check above, and remove() would then raise KeyError.
        missing.discard(pg_id)
        print('{}\tWithdrawn'.format(pg_id))
# In [14]:
import re
# Quick check: an all-digit string matches the anchored pattern in full, so
# the expression evaluates to '23'.
re.compile(r'^\d+$').match('23').group(0)
# Out[14]: '23'
# In [ ]: