In [200]:
"""Required imports"""
import json
import urllib
from bs4 import BeautifulSoup
In [201]:
# We have to generate the webpage dynamically since it's mostly JS.
from contextlib import closing
from selenium.webdriver import Chrome # pip install selenium
from selenium.webdriver.support.ui import WebDriverWait
In [202]:
class FailedBookRetrievalError(Exception):
    '''Raised when a Book cannot be extracted from an .elementList div.'''
    def __init__(self, *args):
        # BUG FIX: the original signature took no arguments, but callers
        # raise these with context (the offending tag), which produced a
        # TypeError instead of the intended exception. Forward everything.
        super(FailedBookRetrievalError, self).__init__(*args)


class FailedAuthorRetrievalError(Exception):
    '''Raised when an Author cannot be extracted from an anchor tag.'''
    def __init__(self, *args):
        super(FailedAuthorRetrievalError, self).__init__(*args)
class Book(object):
    '''A Goodreads book, identified (for equality/hashing) by its id.'''

    def __init__(self, title, author, gr_id):
        # Drop non-ascii characters so later %-formatting is safe.
        self.title = title.encode('ascii', 'ignore')
        self.author = author  # an Author, or None when extraction failed
        self.id = gr_id
        self.href = 'https://www.goodreads.com/book/show/%s' % self.id

    def __eq__(self, other):
        # BUG FIX: guard against non-Book operands (the original raised
        # AttributeError when compared against anything without .id).
        return isinstance(other, Book) and self.id == other.id

    def __ne__(self, other):
        # BUG FIX: Python 2 does not derive != from ==; without this,
        # `a != b` compared identities, not ids.
        return not self.__eq__(other)

    def __hash__(self):
        return hash(str(self.id))

    def __str__(self):
        # BUG FIX: extract_book() stores author=None on failure; the
        # original crashed on self.author.name in that case.
        author_name = self.author.name if self.author is not None else 'unknown'
        return "%s by %s" % (self.title, author_name)

    def __repr__(self):
        return "%s: %s" % (str(self), self.href)
class Author(object):
    '''A Goodreads author. Two Authors are the same iff their ids match.'''

    def __init__(self, name, gr_id):
        # Keep only ascii characters of the display name.
        self.name = name.encode('ascii', 'ignore')
        self.id = gr_id
        self.href = 'https://www.goodreads.com/author/show/%s' % self.id

    def __eq__(self, other):
        # Equality is based purely on the Goodreads id.
        return other.id == self.id

    def __hash__(self):
        # Hash must agree with __eq__: derive it from the id alone.
        return hash(str(self.id))

    def __str__(self):
        return self.name

    def __repr__(self):
        return "%s: %s" % (str(self), self.href)
In [203]:
def extract_author(anchor):
    '''
    Given an anchor .authorName extracts an Author object.

    Args:
        anchor: a BeautifulSoup <a class="authorName"> tag.

    Returns:
        An Author built from the anchor's href (id is the last path
        component) and the text of its inner <span>.

    Raises:
        FailedAuthorRetrievalError: if the anchor lacks the expected
            href/span structure.
    '''
    try:
        # Goodreads author URLs end in /author/show/<id>.
        gr_id = anchor['href'].split('/')[-1]
        name = anchor.find('span').contents[0]
    except (TypeError, KeyError, AttributeError, IndexError):
        # Narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit.
        raise FailedAuthorRetrievalError(anchor)
    return Author(name=name, gr_id=gr_id)
def extract_book(div):
    '''
    Given an .elementList div for a book, returns a Book object.

    Args:
        div: a BeautifulSoup div containing .bookTitle / .authorName anchors.

    Returns:
        A Book; its author is None when author extraction fails.

    Raises:
        FailedBookRetrievalError: if the title anchor is missing or
            malformed.
    '''
    # Look the anchor up outside the try so it is always bound when we
    # attach it to the exception below (the original could hit a
    # NameError if find() itself raised).
    anchor_tag = div.find('a', {'class': 'bookTitle'})
    try:
        title = anchor_tag.contents[0]
        gr_id = anchor_tag['href'].split('/')[-1]
    except (TypeError, KeyError, AttributeError, IndexError):
        raise FailedBookRetrievalError(anchor_tag)
    try:
        author_tag = div.find('a', {'class': 'authorName'})
        author = extract_author(author_tag)
    except Exception:
        # A missing/broken author is tolerated: Book accepts author=None.
        # (Exception, not bare except, so Ctrl-C still interrupts.)
        author = None
    return Book(title=title, author=author, gr_id=gr_id)
In [204]:
def get_books(html):
'''
Extracts a set of Books from the html raw source if possible.
'''
soup = BeautifulSoup(html, 'html.parser')
result_list = soup.find('div', { 'class': 'leftContainer'})
try:
results = result_list.find_all('div', {'class': 'elementList'})
except:
print "Failed to find any results on page."
return set()
books = []
for res in results:
try:
books.append(extract_book(res))
except FailedBookRetrievalError:
print "Failed to extract book from %s" % res
pass
return set(books)
In [205]:
def get_raw_html(browser, shelf, page=1):
    '''
    Fetches the raw html for the page corresponding to `shelf`
    and `page` number.

    Args:
        browser: a selenium webdriver instance.
        shelf: shelf name appended to the service URL.
        page: 1-based page number, passed as a query parameter.

    Returns:
        The browser's rendered page source (the page is mostly JS,
        hence selenium rather than a plain HTTP fetch).
    '''
    service_url = 'https://www.goodreads.com/shelf/show/'
    query = urllib.urlencode({'page': page})
    browser.get(service_url + shelf + '?' + query)
    return browser.page_source
In [206]:
def get_shelf(browser, term, n = 100):
    '''
    Given a browser and a shelf name, returns a set of `Book`s as
    determined by Good Reads, stopping once n unique books are found
    or the shelf stops yielding new ones.

    Args:
        browser: a selenium webdriver instance.
        term: shelf name, e.g. 'fiction'.
        n: target number of unique books.

    Returns:
        A set of Book objects (may be smaller than n).
    '''
    page = 1
    books = set()
    failed_sequence = 0  # consecutive pages that added no NEW books
    while len(books) < n:
        html = get_raw_html(browser, term, page)
        books_in_page = get_books(html)
        before = len(books)
        books = books.union(books_in_page)
        if len(books) == before:
            # BUG FIX: the original only counted truly empty pages, so a
            # shelf that keeps serving duplicates reset the counter and
            # looped forever. Count any page that adds nothing new.
            failed_sequence += 1
            if failed_sequence > 10:
                break
        else:
            failed_sequence = 0
        page += 1
    return books
In [207]:
def login(browser, email, password):
    '''
    Login to GoodReads so we can access more data.

    Args:
        browser: a selenium webdriver instance.
        email: account email typed into #user_email.
        password: account password typed into #user_password.
    '''
    browser.get('https://www.goodreads.com/user/sign_in')
    # Fill both form fields, then submit via the 'next' button.
    for field_id, value in (('user_email', email),
                            ('user_password', password)):
        browser.find_element_by_id(field_id).send_keys(value)
    browser.find_element_by_name('next').click()
In [208]:
import pickle
def dump_book_set(s, filename):
    '''
    Pickles the book set `s` to `filename`.

    Args:
        s: any picklable object (here, a set of Books).
        filename: destination path; the file is created/truncated.
    '''
    # BUG FIX: pickle streams are binary; the original 'w+' text mode
    # corrupts the output on Windows and breaks outright on Python 3.
    with open(filename, 'wb') as handle:
        pickle.dump(s, handle)
In [209]:
def find_books(tags, n = 500):
'''
Given a list of tags, finds their intersection by looking
at the top n books in each tag and intersecting the resulting
sets.
Returns:
A set of Book objects. It may be helpful to run something like:
for book in book:
print book
'''
book_set = set()
with closing(Chrome('./chromedriver')) as browser:
login(browser, 'luis.perez.live@gmail.com', 'luis3137')
for tag in tags:
shelf = get_shelf(browser, tag, n = n)
dump_book_set(shelf, "%s_n=%s_set.pk" % (tag, n))
book_set = book_set.intersection(shelf)
print "Finished collecting data for tag %s." % tag
return book_set
In [216]:
# Collect books shelved as 'adult'. NOTE(review): n=100000 presumably
# exceeds what Goodreads serves, so this relies on get_shelf's
# termination logic rather than actually reaching n books -- confirm.
books = find_books(['adult'], n = 100000)
In [212]:
# Print each collected book's URL and author (Python 2 print statement).
for book in books:
    print book.href + " author: " + str(book.author)
In [214]:
# Load all of the results and intersect.
def _load_set(path):
    '''Unpickle one saved shelf, closing the file handle afterwards.'''
    # BUG FIX: the original pickle.load(open(...)) leaked four file
    # handles and used text mode; pickle wants binary.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

a, b, c, d = (_load_set('strong-heroine_n=100000_set.pk'),
              _load_set('romance_n=100000_set.pk'),
              _load_set('female-lead_n=100000_set.pk'),
              _load_set('young-adult_n=100000_set.pk'))
In [215]:
# Print the books common to all four saved shelves.
for book in set.intersection(a,b,c,d):
    print book.href + " author: " + str(book.author)
In [ ]:
# Good reads data pulling.
# NOTE(review): this redefines the selenium-based get_shelf above --
# whichever notebook cell ran last wins. Consider renaming one of them.
def get_shelf(shelf_name):
    '''
    Fetches the raw HTML of a Goodreads shelf page via plain urllib
    (no JS rendering -- see the selenium path above for dynamic pages).

    Args:
        shelf_name: shelf to fetch, e.g. 'fiction'.

    Returns:
        The response body as a string.
    '''
    service_url = 'https://www.goodreads.com/shelf/show/'
    # BUG FIX: the shelf name is a path component, not a query string;
    # the original built 'https://www.goodreads.com/shelf/show/?fiction'.
    url = service_url + shelf_name
    response = urllib.urlopen(url).read()
    return response
In [ ]:
# Demo invocation: fetch the raw HTML for the "fiction" shelf (network I/O).
get_shelf("fiction")
In [ ]: