In [1]:
import csv
import sys
sys.path.append("..")
from gitenberg.util.catalog import BookMetadata
# Directory handed to BookMetadata as its `rdf_library` argument further down.
# NOTE(review): absolute, machine-specific path — presumably the local cache of
# Project Gutenberg RDF records; confirm and adjust for your environment.
rdf_library='/Documents/gitenberg/cache/epub'
class book(object):
    """Minimal book record that carries only an id.

    Instances are passed to ``BookMetadata`` in this notebook; presumably
    ``book_id`` is the only attribute it needs — TODO confirm against
    ``gitenberg.util.catalog``.

    Inherits from ``object`` so that it is a new-style class under Python 2.
    """

    def __init__(self, _id):
        """Store ``_id`` as ``self.book_id``."""
        self.book_id = _id
# --- Report ids in the scanned range that have no entry in the repo list ----

# Ids already listed in the GITenberg repo list (first column of the TSV).
pg_ids = set()
with open('../gitenberg/data/GITenberg_repo_list.tsv', 'r') as f:
    for vals in csv.reader(f, delimiter='\t', quotechar='"'):
        if not vals:
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on vals[0].
            continue
        pg_ids.add(int(vals[0]))

# Exclusive upper bound of the ids to scan (ids 1..51116 are checked).
PG_ID_LIMIT = 51117

# Candidates: every id in the scanned range that is absent from the repo list.
missing = [pg_id for pg_id in range(1, PG_ID_LIMIT) if pg_id not in pg_ids]

# Iterate over a copy so entries can be dropped from `missing` while looping.
for pg_id in list(missing):
    pg_book = book(pg_id)
    metadata = BookMetadata(pg_book, rdf_library=rdf_library, enrich=False)
    try:
        metadata.parse_rdf()
    except TypeError as e:
        # Stop at the first record that fails to parse; ids not yet examined
        # stay in `missing`.
        print('Error in {}'.format(pg_id))
        print(e)
        break

    # Decide whether this id is excluded, and why. The two checks are mutually
    # exclusive so that an id is removed from `missing` at most once (a second
    # list.remove() of the same value would raise ValueError).
    if metadata.gutenberg_type != u'Text':
        reason = metadata.gutenberg_type
    elif metadata.gutenberg_issued is None:
        # This notebook labels records with no issue date as withdrawn.
        reason = 'Withdrawn'
    else:
        continue

    missing.remove(pg_id)
    print('{}\t{}'.format(pg_id, reason))

# Whatever remains is a Text record with an issue date but no repo.
print('{} missing texts'.format(len(missing)))
for pg_id in missing:
    print(pg_id)


182	Withdrawn
183	Withdrawn
184	Withdrawn
185	Withdrawn
186	Withdrawn
187	Withdrawn
188	Withdrawn
189	Withdrawn
190	Withdrawn
191	Withdrawn
192	Withdrawn
193	Withdrawn
194	Withdrawn
195	Withdrawn
196	Withdrawn
197	Withdrawn
198	Withdrawn
199	Withdrawn
758	StillImage
1070	Withdrawn
1071	Withdrawn
1072	Withdrawn
1647	Withdrawn
1648	Withdrawn
1766	Withdrawn
1767	Withdrawn
1789	Withdrawn
1914	Withdrawn
1964	Withdrawn
1984	Withdrawn
2001	Withdrawn
2091	Withdrawn
2200	Withdrawn
2623	Withdrawn
2624	Withdrawn
2625	Withdrawn
2626	Withdrawn
2738	Withdrawn
2877	Withdrawn
2879	Withdrawn
3002	Sound
3018	Withdrawn
3201	Dataset
3501	Dataset
3502	Dataset
3503	Dataset
3504	Dataset
3505	Dataset
3506	Dataset
3507	Dataset
3508	Dataset
3509	Dataset
3510	Dataset
3511	Dataset
3512	Dataset
3513	Dataset
3514	Dataset
3515	Dataset
3516	Dataset
3517	Dataset
3518	Dataset
3519	Dataset
3520	Dataset
3521	Dataset
3522	Dataset
3523	Dataset
3524	Dataset
4749	Image
4750	Image
4751	Image
4949	Image
4950	Image
4951	Image
5188	Image
5189	Image
5190	Image
5613	Withdrawn
5627	Image
5634	Image
5635	Image
5714	Image
5885	Image
5886	Image
6084	Image
6536	Sound
6871	Image
7092	Image
7093	Image
7507	Image
7536	Withdrawn
9116	Withdrawn
9117	Withdrawn
9118	Withdrawn
9119	Withdrawn
9120	Withdrawn
9121	Withdrawn
9122	Withdrawn
9123	Withdrawn
9124	Withdrawn
9125	Withdrawn
9126	Withdrawn
9127	Withdrawn
9128	Withdrawn
9129	Withdrawn
9130	Withdrawn
9131	Withdrawn
9132	Withdrawn
9133	Withdrawn
9134	Withdrawn
9135	Withdrawn
9136	Withdrawn
9137	Withdrawn
9138	Withdrawn
9139	Withdrawn
9140	Withdrawn
9141	Withdrawn
9142	Withdrawn
9144	Withdrawn
9392	Image
11220	Collection
11775	Dataset
11776	Dataset
11777	Dataset
11778	Dataset
11779	Dataset
11780	Dataset
11781	Dataset
11782	Dataset
11783	Dataset
11784	Dataset
11785	Dataset
11786	Dataset
11787	Dataset
11788	Dataset
11789	Dataset
11790	Dataset
11791	Dataset
11792	Dataset
11793	Dataset
11794	Dataset
11795	Dataset
11796	Dataset
11797	Dataset
11798	Dataset
44missing texts
181
565
576
616
622
637
672
676
757
771
772
835
931
933
934
1018
1073
1255
1464
1691
2152
2184
2421
2576
2720
2733
2869
3057
3169
3278
3279
3500
3541
3680
3696
4366
4387
6161
6348
6349
6419
6420
6937
7053

In [ ]:


In [ ]: