In [1]:
import requests
from goodreads import client
import pandas as pd
In [ ]:
# This is the URL prefix common to each record (Goodreads isbn_to_id
# endpoint; the API key is meant to be appended after 'key=').
# NOTE(review): url_prefix does not appear to be used later in this
# notebook — confirm whether this cell is still needed.
url_prefix = 'https://www.goodreads.com/book/isbn_to_id/0441172717,0739467352?key='
In [2]:
# Setting up the GoodReads API client
# Set up the GoodReads API client from a local credentials file
# (key on line 1, secret on line 2 — kept out of the notebook so the
# secrets are never committed with the source).
# Use a context manager so the file handle is closed (the original
# open() leaked the handle for the lifetime of the kernel).
with open('goodreads_credentials') as cred_file:
    key, secret = [element.strip() for element in cred_file.readlines()]
gc = client.GoodreadsClient(key, secret)
In [16]:
# Load the previously combined dataset; the first CSV column is the index.
df = pd.read_csv('Combine.csv',index_col=0)
In [17]:
# Deduplicate the ISBNs so each book is looked up against the API once.
all_isbn = df['isbn'].unique()
In [18]:
# One row per unique ISBN; API-derived feature columns are added below.
isbn_df = pd.DataFrame({'isbn': all_isbn})
In [8]:
c = 0  # global progress counter shared across apply() calls


def get_details(isbn):
    """Fetch book metadata for `isbn` from the GoodReads API.

    Returns a pd.Series with keys 'title', 'description' and 'num_pages'.
    On any lookup failure (bad ISBN, rate limit, network error, ...) all
    three fields are set to the sentinel string 'none', which the cells
    below use to find records to retry or drop.
    """
    global c
    c += 1
    if c % 100 == 0:
        print(c)  # crude progress indicator for long API runs
    try:
        b = gc.book(isbn=isbn)
        return pd.Series({'title': b.title,
                          'description': b.description,
                          'num_pages': b.num_pages})
    except Exception:
        # Catch Exception rather than a bare except so Ctrl-C
        # (KeyboardInterrupt) can still abort a long-running apply().
        return pd.Series({'title': 'none',
                          'description': 'none',
                          'num_pages': 'none'})
In [21]:
# Fetch API features for every unique ISBN. The target column order is
# made to match the Series key order returned by get_details
# ('title', 'description', 'num_pages') so the assignment does not rely
# on pandas' column-alignment behavior, which has varied across versions.
isbn_df[['title', 'description', 'num_pages']] = isbn_df['isbn'].apply(get_details)
In [22]:
# Cache the API results so the expensive lookup pass need not be re-run.
# (Note: the file name has a typo — 'ibsn' not 'isbn' — kept as-is
# because the reload cell below uses the same name.)
isbn_df.to_pickle('ibsn_features_full.pickle')
In [3]:
# Reload the cached API results (lets the notebook resume here without
# repeating the API pass above).
isbn_df = pd.read_pickle('ibsn_features_full.pickle')
In [23]:
# Records for which the API lookup failed on the first pass. Take an
# explicit copy so the retry cell below can assign into dfx without
# triggering SettingWithCopyWarning / writing through a view.
dfx = isbn_df[isbn_df['title'] == 'none'].copy()
In [26]:
# Quick look at the records that failed the first API pass.
dfx.head()
Out[26]:
In [33]:
# Retry the failed lookups (transient API errors often succeed on a
# second pass). Target column order matches the Series key order
# returned by get_details, avoiding reliance on column alignment.
dfx[['title', 'description', 'num_pages']] = dfx['isbn'].apply(get_details)
In [40]:
# How many records still have no API data after the retry pass?
still_missing = dfx.loc[dfx['title'] == 'none']
still_missing.shape
Out[40]:
There are 34 records that still remain with no information from the API. We shall remove these records from our dataset as they form a negligible portion of our sample of books.
In [19]:
# Merge the retried rows back into isbn_df. A single label-aligned .loc
# assignment replaces the original row-by-row iterrows() loop: pandas
# aligns the value DataFrame on both index and columns, so the result is
# the same, without per-row Python overhead.
isbn_df.loc[dfx.index] = dfx
In [42]:
# Sanity check: after merging the retried rows, isbn_df should contain
# exactly as many empty ('none') records as dfx does.
missing_in_merged = isbn_df[isbn_df['title'] == 'none']
missing_in_retry = dfx[dfx['title'] == 'none']
missing_in_merged.shape == missing_in_retry.shape
Out[42]:
In [43]:
# dfx = isbn_df[isbn_df['title'] == 'none']
In [39]:
# Persist the merged dataset (first pass + retry results).
# (Same 'ibsn' filename typo as the earlier cache — kept for consistency.)
isbn_df.to_pickle('ibsn_features_new_batch.pickle')