In [1]:
from IPython.core.display import HTML

print("Setting custom CSS for the IPython Notebook")
# Read the stylesheet with a context manager so the file handle is
# closed deterministically (the original open(...).read() leaked it).
with open('custom.css', 'r') as f:
    styles = f.read()
# Bare last expression -> the notebook renders the injected CSS.
HTML(styles)
Out[1]:
In [2]:
## all imports
import numpy as np
import urllib2
import bs4 #this is beautiful soup
from pandas import Series
import pandas as pd
from pandas import DataFrame
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context("talk")
sns.set_style("white")
API key registration links — Rotten Tomatoes: http://developer.rottentomatoes.com/member/register
Twitter: https://apps.twitter.com/app/new
In [3]:
# A complete (if tiny) HTML document held in a Python string.
s = """<!DOCTYPE html>
<html>
<head>
<title>This is a title</title>
</head>
<body>
<h2> Test </h2>
<p>Hello world!</p>
</body>
</html>"""
# Wrap the markup in IPython's HTML display object so the notebook
# renders it instead of showing the raw string.
h = HTML(s)
# Bare last expression in the cell -> rich (rendered) display.
h
Out[3]:
In [4]:
url = 'http://www.crummy.com/software/BeautifulSoup'
# urlopen returns a file-like response object; keep a reference so the
# connection can be closed instead of leaked by urlopen(url).read().
response = urllib2.urlopen(url)
source = response.read()
response.close()
# Parenthesized print of a single argument behaves identically on
# Python 2 and Python 3.
print(source)
In [5]:
## is 'Alice' in source?
## count occurrences of 'Soup'
## find index of 'alien video games'
In [6]:
## get bs4 object
# Name the parser explicitly: without it BeautifulSoup picks whatever
# library happens to be installed, which varies between machines and
# triggers a warning in newer bs4 versions.
soup = bs4.BeautifulSoup(source, 'html.parser')
## show prettify()
## show how to find all a tags
## ***Why does this not work? ***
## findAll matches tag NAMES (like 'a' or 'p'); 'Soup' is page text,
## so searching for a <Soup> tag returns nothing.
#soup.findAll('Soup')
In [7]:
## get attribute value from an element:
## find tag
## get attribute
##get all links in the page
##filter all external links
In [8]:
s = """<!DOCTYPE html><html><head><title>This is a title</title></head><body><h3> Test </h3><p>Hello world!</p></body></html>"""
## get bs4 object
# Explicit parser keeps the parse tree consistent across installations
# (and silences the "no parser specified" warning).
tree = bs4.BeautifulSoup(s, 'html.parser')
## get html root node
## get head from root using contents
## get body from root
## could directly access body
In [9]:
## get h3 tag from body
In [10]:
## use ul as entry point
## get hall of fame list from entry point
## reformat into a list
## maybe show some advanced python
In [11]:
stuff_i_like = ['burger', 'sushi', 'sweet potato fries', 'BBQ']
found_happy_hours = []
my_happy_hours = []
# First, I'm going to identify the areas of the page I want to look at
url = 'http://www.downtownla.com/3_10_happyHours.asp?action=ALL'
# Keep a handle on the HTTP response so it can be closed (not leaked),
# and name the parser explicitly for reproducible parsing.
response = urllib2.urlopen(url)
source = response.read()
response.close()
tables = bs4.BeautifulSoup(source, 'html.parser')
In [12]:
# Then, I'm going to sort out the *exact* parts of the page
# that match what I'm looking for...
# (Indentation below was stripped by the notebook export; restored.)
for t in tables.findAll('p', {'class': 'calendar_EventTitle'}):
    # The event title is the <p> itself; the details live in its
    # following sibling elements, so stitch them into one text blob.
    text = t.text
    for s in t.findNextSiblings():
        text += '\n' + s.text
    found_happy_hours.append(text)
print("The scraper found %d happy hours!" % len(found_happy_hours))
In [13]:
# Now I'm going to loop through the food I like
# and see if any of the happy hour descriptions match
# (Indentation below was stripped by the notebook export; restored.)
for food in stuff_i_like:
    for hh in found_happy_hours:
        # checking for text AND making sure I don't have duplicates
        if food in hh and hh not in my_happy_hours:
            print("YAY! I found some %s!" % food)
            my_happy_hours.append(hh)
print("I think you might like %d of them, yipeeeee!" % len(my_happy_hours))
In [14]:
# Now, let's make a mail message we can read:
message = 'Hey Katharine,\n\n\n'
message += 'OMG, I found some stuff for you in Downtown, take a look.\n\n'
message += '==============================\n'.join(my_happy_hours)
# Clean up whitespace while the message is still text.
message = message.replace('\t', '').replace('\r', '')
message += '\n\nXOXO,\n Your Py Script'
# Encode LAST: the original encoded mid-way and then kept calling string
# methods / concatenating text onto the result, which breaks once
# encode() returns bytes (Python 3) and is fragile even on Python 2.
# To read more about encoding:
# http://diveintopython.org/xml_processing/unicode.html
message = message.encode('utf-8')
#print(message)
In [15]:
import json
import secret   # local module holding API credentials (keeps keys out of the notebook)
import requests
# Build the Rotten Tomatoes "top DVD rentals" endpoint URL.
# NOTE(review): the key comes from secret.py rather than being hardcoded
# (good), but printing `url` or `data` could still leak it into outputs.
api_key = secret.rottenTomatoes_key()
url = 'http://api.rottentomatoes.com/api/public/v1.0/lists/dvds/top_rentals.json?apikey=' + api_key
# Fetch the raw JSON payload; it is parsed into a dict in a later cell.
data = urllib2.urlopen(url).read()
#print data
In [16]:
# A tiny dict to demo dictionary access patterns.
a = {'a': 1, 'b': 2}
# Parenthesized print of a single argument works identically on
# Python 2 and Python 3.
print(a)
#show keys
#show values
#show for loop over all entries
#explicit, zipped, iteritems
In [17]:
# JSON round-trip demo: serialize a dict to a string, then parse it back.
a = {'a': 1, 'b': 2}
s = json.dumps(a)
a2 = json.loads(s)
# Inline sanity check: a JSON round-trip must reproduce the dict exactly.
assert a2 == a
In [18]:
# Parse the raw JSON payload fetched from the Rotten Tomatoes API above
# into nested Python dicts/lists.
#create dictionary from JSON
dataDict = json.loads(data)
#explore dictionary
#filter movies
#find critics score
In [19]:
# critics scores list
# audience scores list
In [20]:
# create pandas data frame with critics and audience score
# first create numpy array
# then create DataFrame with data and columns
# also create a list with all movie titles
# set index of dataFrame BEWARE of inplace!
In [21]:
# create a bar plot with the data
# set the title to Score Comparison
# set the x label
# set the y label
# show the plot
In [22]:
import twitter
# define the necessary keys
# All four OAuth credentials are loaded from the local secret.py module --
# never hardcode API keys/tokens directly in a notebook.
cKey = secret.twitterAPI_key()
cSecret = secret.twitterAPI_secret()
aKey = secret.twitterAPI_access_token_key()
aSecret = secret.twitterAPI_access_token_secret()
# create the api object used by the following cells to query timelines
api = twitter.Api(consumer_key=cKey, consumer_secret=cSecret, access_token_key=aKey, access_token_secret=aSecret)
In [23]:
# get the user timeline with screen_name = 'rafalab'
# create a data frame
# first get a list of panda Series or dict
# then create the data frame
In [24]:
# filter tweets with enough retweet_count
# print the text of these tweets
In [25]:
# create a view for favorite_count on maybe_interesting
# change a value
# look at original frame
# do it again but this time with copy
Remember Rafael's nice illustration of this from last week; for some more Python details, see:
http://cs109.github.io/2014/pages/lectures/04-distance.html#/11
In [26]:
# Import the submodule explicitly: a bare `import scipy` does not
# reliably make scipy.linalg (used below for the SVD) available --
# SciPy submodules must be imported themselves.
import scipy.linalg

# Fix the RNG seed so the sampled data -- and every downstream plot
# and SVD result -- is reproducible across runs.
np.random.seed(seed=99)
# make some data up: 500 draws from a 2-D Gaussian with correlation 0.7
mean = [0, 0]
cov = [[1.0, 0.7], [0.7, 1.0]]
x, y = np.random.multivariate_normal(mean, cov, 500).T
In [27]:
# Scatter the raw samples; equal axis scaling keeps the correlation
# structure from being visually distorted.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.axis('equal')
plt.show()
In [28]:
# create a data matrix: stack the coordinate vectors into shape (n, 2)
matrix = np.column_stack((x, y))
# compute SVD (full matrices: U is (n, n), s has 2 values, Vh is (2, 2))
U, s, Vh = scipy.linalg.svd(matrix)
# blow up s into a rectangular diagonal matrix, sized from the data
# itself rather than the hard-coded (500, 2) -- works for any n.
S = scipy.linalg.diagsvd(s, *matrix.shape)
# reconstruct the data (sanity test): U S Vh must reproduce the input
reconstruction = np.dot(U, np.dot(S, Vh))
print(matrix[1, :])
print(reconstruction[1, :])
print(np.allclose(matrix, reconstruction))
In [29]:
# show the column vectors of V
# The right-singular vectors are the ROWS of Vh; transpose so they can
# be read off as columns of V.
V = Vh.T
plt.scatter(x, y)
# leading principal direction (largest singular value) in red
plt.plot([0, V[0,0]], [0,V[1,0]], c='r', linewidth=10.0)
# second principal direction in yellow
plt.plot([0, V[0,1]], [0,V[1,1]], c='y', linewidth=10.0)
plt.axis('equal')
plt.show()
In [30]:
# Two equivalent routes to project the data onto the leading singular
# direction: scale U's first column via S, or multiply the data matrix
# by V's first column.
projection = U.dot(S[:, :1])
projection2 = matrix.dot(V[:, :1])
# Both routes must agree (last expression is the cell's output).
np.allclose(projection, projection2)
Out[30]:
In [31]:
# compare the plots: the 1-D projection (red, collapsed onto the axis)
# vs the original 2-D points (blue), with grey lines tying each pair.
# (Loop-body indentation was stripped by the notebook export; restored.)
plt.clf()
zeros = np.zeros_like(projection)
plt.scatter(projection, zeros, c='r', zorder=2)
plt.scatter(x, y, c='b', zorder=2)
for px, py, proj in zip(x, y, projection):
    plt.plot([px, proj], [py, 0], c='0.5', linewidth=1.0, zorder=1)
plt.axis('equal')
plt.show()
In [32]:
## try to reconstruct back to 2D
# just a reminder: coordinates along the leading singular direction
projection = U.dot(S[:, :1])
# now the reconstruction: map the 1-D coordinates back out through the
# first right-singular vector, giving (n, 2) points lying on a line.
reconstruction = projection.dot(Vh[:1, :])
# Last expression is the cell's output: should be (500, 2).
reconstruction.shape
Out[32]:
In [33]:
# compare the plots
plt.clf()
zeros = np.zeros_like(projection)
plt.scatter(reconstruction[:,0], reconstruction[:,1], c='r', zorder=2)
plt.scatter(x,y,c='b', zorder=2)
for px, py, rx,ry in zip(x,y,reconstruction[:,0],
reconstruction[:,1]):
plt.plot([px,rx],[py,ry],c='0.5', linewidth=1.0, zorder=1)
plt.axis('equal')
plt.show()