Tanya Schlusser 11 December 2014
Slides prepared using IPython Notebook. (Awesome quick tutorial... and how to 'Markdown')
Following along? Clone this: github/tanyaschlusser/ipython_talk__OReilly_python_books
In [1]:
# See what's out there now. Pull the:
# -- media type (book | video)
# -- title
# -- publication date
import requests
from bs4 import BeautifulSoup
books_uri = "http://shop.oreilly.com/category/browse-subjects/programming/python.do?sortby=publicationDate&page=%d"
In [2]:
# Loop over all of the pages
results = []
description_results = {}
for page in range(1, 5):
    result = requests.get(books_uri % page)
    soup = BeautifulSoup(result.text)
    books = soup.find_all("td", "thumbtext")
    for b in books:
        # Publication year: the last numeric token in the 'directorydate' span
        yr = b.find("span", "directorydate").string.strip().split()
        while not yr[-1].isdigit():
            yr.pop()
        yr = int(yr[-1])
        title = b.find("div", "thumbheader").text.strip()
        url = b.find("div", "thumbheader").find("a")["href"]
        hasvideo = "Video" in b.text
        results.append(dict(year=yr, title=title, hasvideo=hasvideo))
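As an aside (not part of the original talk), the bare requests.get calls assume every request succeeds. A minimal defensive sketch with a timeout and a status check -- fetch_page is a hypothetical helper, not defined anywhere above:

import requests

def fetch_page(uri_template, page, timeout=10):
    # Fetch one catalog page; raise on HTTP errors instead of
    # silently parsing an error page.
    response = requests.get(uri_template % page, timeout=timeout)
    response.raise_for_status()
    return response.text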
In [3]:
# Want to
# -- plot year over year number of books
# ++ stacked plot with video + print
# -- Get all the different words in the titles
# ++ count them
# ++ and order by frequency
#
# Use the Matplotlib magic command. Magic commands start with '%'.
# This sets up to plot inline. It doesn't import anything...
# Or use %pylab inline -- this apparently imports a lot of things into
# the global namespace
#
%matplotlib inline
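For reference, %pylab inline is roughly equivalent to the following (a sketch; the exact imports depend on the IPython version):

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from pylab import *  # this is the part that floods the global namespace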
In [18]:
# For year over year I need pandas.DataFrame.groupby
# For stacked plot I need matplotlib.pyplot
# Plain dictionary for the word counts
#
import pylab
import matplotlib.pyplot as plt
import pandas as pd
In [19]:
# Year over year -- number of publications by 'video' and 'print'.
#
df = pd.DataFrame(results)
byyear = pd.crosstab(df["year"], df["hasvideo"])
byyear.rename(columns={True: 'video', False: 'print'}, inplace=True)
byyear.plot(kind="area", xlim=(2000, 2014), title="Ever increasing publications")
Out[19]: [stacked area chart: Python publications per year, print vs. video]
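The earlier comment mentions pandas.DataFrame.groupby, but the cell ends up using pd.crosstab. A rough groupby equivalent, assuming the same df (a sketch, untested against the original data):

byyear_alt = df.groupby(["year", "hasvideo"]).size().unstack().fillna(0)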
In [6]:
# Out of curiosity, what happened in 2010?
df[df["year"]==2010]
Out[6]: [DataFrame listing the books published in 2010]
In [7]:
# Break up the titles and count words in the titles.
# -- Need a regex for the punctuation (commas and colons)
# -- Need a stemmer for plurals, possessives, and verb conjugations
import re
from nltk.stem.porter import PorterStemmer

space_or_punct = re.compile(r"[\s,:\.]+")
stemmer = PorterStemmer()
title_words = {}
for r in results:
    title = space_or_punct.split(r["title"].lower())
    stemmed_title = (stemmer.stem(t) for t in title)
    for t in stemmed_title:
        # don't retain version or release numbers
        if not t[0].isdigit():
            if t not in title_words:
                title_words[t] = 1
            else:
                title_words[t] += 1

print "Total distinct words in the titles:", len(title_words), "\n"
print "\t".join(title_words.keys())
print "\n"
print "\n".join(r["title"] for r in results if not r["hasvideo"])
In [8]:
# That was useless -- almost every word except for "Python" and "Learn"
# shows up in only one title. Lame.
#
# Loop over all of the pages again and get the book descriptions.
# Maybe the descriptions reveal common topics
# (hoping for 'introductory' or 'web development' or 'machine learning')
import nltk
# Before running this you need to do nltk.download() and select
# 'stopwords' from the corpus
from nltk.corpus import stopwords

english_stops = stopwords.words("english")  # NLTK corpus names are lowercase
description_results = {}
for page in range(1, 5):
    result = requests.get(books_uri % page)
    soup = BeautifulSoup(result.text)
    books = soup.find_all("td", "thumbtext")
    for b in books:
        title = b.find("div", "thumbheader").text.strip()
        # Only look at the books
        if "Video" not in b.text:
            print ".",  # progress marker
            url = "http://shop.oreilly.com" + b.find("div", "thumbheader").find("a")["href"]
            result2 = requests.get(url)
            soup2 = BeautifulSoup(result2.text)
            description = soup2.find("div", "detail-description-content").text
            description_results[title] = set(
                stemmer.stem(word)
                for word in space_or_punct.split(description.lower())
                if word not in english_stops)
print
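One more aside: this loop fires dozens of requests at the shop in quick succession. A polite-scraping variant would pause between detail pages (a sketch; the one-second delay is an arbitrary choice):

import time
import requests

def polite_get(url, delay=1.0):
    # Pause before each request so the loop does not hammer the server.
    time.sleep(delay)
    return requests.get(url)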
In [9]:
# Try clustering:
# -- Distance between two book descriptions is the percent overlap
#    of words in both their descriptions
# -- Arbitrarily (from qualitative looking) decide on a threshold of
#    17% overlap in descriptions for both books to be 'similar'
#    and look at what we get
percent_overlap = []
min_intersections = []
max_intersections = []
avg_intersections = []
similar_books = {}
sorted_titles = sorted(description_results.keys())
for i in range(len(sorted_titles)):
    this_description = description_results[sorted_titles[i]]

    def get_percent_overlap(title):
        intersection_size = len(this_description.intersection(description_results[title]))
        union_size = len(this_description.union(description_results[title]))
        return 100.0 * intersection_size / union_size

    percent_overlap.append([get_percent_overlap(t) for t in sorted_titles])
    similar_books[sorted_titles[i]] = [
        t for t in sorted_titles
        if get_percent_overlap(t) > 17 and t != sorted_titles[i]
    ]
    min_intersections.append(round(min(percent_overlap[-1])))
    max_intersections.append(round(max(percent_overlap[-1])))
    # average percent overlap against all titles
    avg_intersections.append(round(sum(percent_overlap[-1]) / len(sorted_titles)))

print "\n".join("\n%s\n%s" % (k, "|".join(v)) for k, v in similar_books.iteritems())
In [10]:
from scipy.cluster.hierarchy import linkage, dendrogram
In [17]:
plt.figure(figsize=(5, 20))
data_link = linkage(percent_overlap, method='single', metric='euclidean')
den = dendrogram(data_link, labels=sorted_titles, orientation="left")
plt.ylabel('Samples', fontsize=9)
plt.xlabel('Distance')
plt.suptitle('Books clustered by description similarity', fontweight='bold', fontsize=14);
Out[17]: [dendrogram: books clustered by description similarity]
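A note on the linkage call above: it treats each row of percent_overlap as a feature vector and clusters on euclidean distances between those rows. To cluster on the pairwise similarities directly, one alternative (a sketch, not what the talk does) converts the similarity matrix to a condensed distance matrix first:

import numpy as np
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage

# Similarity (percent shared words) -> distance; the diagonal is 0
# because every description overlaps itself 100%.
distances = 100.0 - np.array(percent_overlap)
condensed = squareform(distances, checks=False)
data_link_alt = linkage(condensed, method='single')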
In [20]:
# The output of linkage is hard for a human to interpret.
# Nest it into a JSON structure for readability.
# print data_link
human_links = dict(enumerate(sorted_titles))
index = len(human_links)
for link in data_link:
    # linkage returns float indices; cast so the dict keys stay integers
    left = int(link[0])
    right = int(link[1])
    human_links[index] = {left: human_links[left], right: human_links[right]}
    del human_links[left], human_links[right]
    index += 1

import json
with open("data/hclust.json", "w") as outfile:
    outfile.write(json.dumps(human_links))
#print human_links
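The nested result looks roughly like this (made-up titles and indices, just to show the shape; json.dumps turns the integer keys into strings):

{
  "61": {
    "59": {"12": "Learning Python", "44": "Python Pocket Reference"},
    "33": "Python Cookbook"
  },
  ...
}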
In [23]:
from IPython.display import HTML
container = """
<script type="text/javascript" src="http://d3js.org/d3.v3.min.js"></script>
<a href='http://bl.ocks.org/mbostock/4063570'>Attribution: Michael Bostock</a>
<div id='display_container'></div>"""
with open("data/d3-stacked-Tree.js") as infile:
display_js = infile.read()
with open("data/human_hclust.json") as infile:
the_json = infile.read()
HTML(container + display_js % the_json )
Out[23]: [interactive d3 tree of the clustered book titles]
It makes sense that topics have little overlap. Otherwise why write a different book? Themes: