In [ ]:
%matplotlib inline
from collections import defaultdict
import json
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl
#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
(0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
(0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
(0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
(0.4, 0.6509803921568628, 0.11764705882352941),
(0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
(0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors) #'axes.color_cycle' was removed in matplotlib 2.0
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
#rcParams['font.family'] = 'sans-serif'
#rcParams['font.sans-serif'] = 'Helvetica'
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
"""
Minimize chartjunk by stripping out unnecessary plot borders and axis ticks.
The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn.
"""
ax = axes or plt.gca()
ax.spines['top'].set_visible(top)
ax.spines['right'].set_visible(right)
ax.spines['left'].set_visible(left)
ax.spines['bottom'].set_visible(bottom)
#turn off all ticks
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_ticks_position('none')
#now re-enable ticks on the visible sides
if top:
ax.xaxis.tick_top()
if bottom:
ax.xaxis.tick_bottom()
if left:
ax.yaxis.tick_left()
if right:
ax.yaxis.tick_right()
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
#utility functions
def histogram_style():
remove_border(left=False)
plt.grid(False)
plt.grid(axis='y', color='w', linestyle='-', lw=1)
def histogram_labels(xlabel, ylabel, title, loc):
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.title(title)
plt.legend(frameon=False, loc=loc)
def histogram_settings(xlabel, ylabel, title, loc = 'upper left'):
histogram_style()
histogram_labels(xlabel, ylabel, title, loc)
In [ ]:
from matplotlib.colors import ListedColormap
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
cm = plt.cm.RdBu #note: this shadows the matplotlib.cm module imported above
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
def points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=True, colorscale=cmap_light, cdiscrete=cmap_bold, alpha=0.1, psize=10, zfunc=False, predicted=False):
h = .02
X=np.concatenate((Xtr, Xte))
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
np.linspace(y_min, y_max, 100))
#plt.figure(figsize=(10,6))
    if zfunc:
        #call predict_proba once and pass both class probabilities to zfunc
        probs = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
        Z = zfunc(probs[:, 0], probs[:, 1])
    else:
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
ZZ = Z.reshape(xx.shape)
if mesh:
        ax.pcolormesh(xx, yy, ZZ, cmap=colorscale, alpha=alpha) #draw on the given Axes and honor the colorscale argument; pyplot has no axes= kwarg
if predicted:
showtr = clf.predict(Xtr)
showte = clf.predict(Xte)
else:
showtr = ytr
showte = yte
    ax.scatter(Xtr[:, 0], Xtr[:, 1], c=showtr-1, cmap=cdiscrete, s=psize, alpha=alpha, edgecolor="k")
    # and testing points
    ax.scatter(Xte[:, 0], Xte[:, 1], c=showte-1, cmap=cdiscrete, alpha=alpha, marker="s", s=psize+10)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
return ax,xx,yy
def points_plot_prob(ax, Xtr, Xte, ytr, yte, clf, colorscale=cmap_light, cdiscrete=cmap_bold, ccolor=cm, psize=10, alpha=0.1):
ax,xx,yy = points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=False, colorscale=colorscale, cdiscrete=cdiscrete, psize=psize, alpha=alpha, predicted=True)
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=ccolor, alpha=.2)
    cs2 = ax.contour(xx, yy, Z, cmap=ccolor, alpha=.6)
    ax.clabel(cs2, fmt='%2.1f', colors='k', fontsize=14)
return ax
In [ ]:
from sklearn.model_selection import train_test_split, GridSearchCV #sklearn.cross_validation and sklearn.grid_search were removed in 0.20
"""
Function
--------
def cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5)
Performs n_folds-fold cross-validation over a grid of parameter values (say the regularization
parameter C) and returns the estimator refit with the best-performing parameters
Parameters
----------
clf : Model (eg: LogisticRegression())
parameters: List of parameter values to test eg. {"C": [0.01, 0.1, 1, 10, 100]}
Xtrain: Training features
ytrain: Training labels
n_folds: No of cross validation folds eg. 5
Returns
-------
best: the estimator refit with the best parameter values from the grid
"""
def cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5):
gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
gs.fit(Xtrain, ytrain)
print "BEST PARAMS", gs.best_params_
best = gs.best_estimator_
return best
"""
Function
--------
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, standardize=False, train_size=0.8)
Trains a model and computes the accuracy on train and test data. Sub-tasks included:
1. Subset data frame by the given features
2. Split into train and test data
3. Cross validation using cv_optimize
4. Fit the model
5. Compute and print train and test accuracy
Parameters
----------
clf : Model (eg: LogisticRegression())
parameters: List of parameter values to test eg. {"C": [0.01, 0.1, 1, 10, 100]}
indf: data frame
featurenames: List of feature names
targetname: Name of prediction variable
target1val: Value of prediction variable
Returns
-------
clf, Xtrain, ytrain, Xtest, ytest
"""
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, standardize=False, train_size=0.8):
subdf=indf[featurenames]
if standardize:
subdfstd=(subdf - subdf.mean())/subdf.std()
else:
subdfstd=subdf
X=subdfstd.values
y=(indf[targetname].values==target1val)*1
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size)
clf = cv_optimize(clf, parameters, Xtrain, ytrain)
clf=clf.fit(Xtrain, ytrain)
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)
print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data: %0.2f" % (test_accuracy)
return clf, Xtrain, ytrain, Xtest, ytest
In [ ]:
import simplejson as json
with open('./data/us_state_map.geojson','r') as fp:
statedata = json.load(fp)
with open('./data/us_county_map.geojson','r') as fp:
data = json.load(fp)
In [1]:
#your code here
def get_senate_vote(vote):
url = 'https://www.govtrack.us/data/congress/113/votes/2013/s'+str(vote)+'/data.json'
vote_data = requests.get(url)
return json.loads(vote_data.text)
In [2]:
#Your code here
def get_all_votes():
    url = 'https://www.govtrack.us/data/congress/113/votes/2013'
    response = requests.get(url)
    soup = bs(response.content, "lxml")
    #senate vote directories look like 's<number>/'; strip the leading 's' and trailing '/'
    s_votes = [a['href'][1:-1] for a in soup.find_all('a', href=True)
               if a['href'].startswith('s')]
    return [get_senate_vote(vote) for vote in s_votes]
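A quick usage sketch (network access required; that roll call 11 exists and the govtrack directory layout is unchanged are assumptions here, and the variable names are illustrative):
In [ ]:
#fetch one roll call, then scrape the directory for every 2013 senate vote
vote11 = get_senate_vote(11) #assumed example roll call; returns the parsed JSON dict
all_votes = get_all_votes() #one dict per senate vote; this issues one HTTP request per vote
print(len(all_votes))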
In [ ]:
fulldf=pd.read_csv("bigdf.csv")
The following function is used to re-compute review counts and averages whenever you subset a reviews data frame. We'll use it soon to construct a smaller, more computationally tractable data frame.
In [ ]:
def recompute_frame(ldf):
"""
takes a dataframe ldf, makes a copy of it, and returns the copy
with all averages and review counts recomputed
this is used when a frame is subsetted.
"""
ldfu=ldf.groupby('user_id')
ldfb=ldf.groupby('business_id')
user_avg=ldfu.stars.mean()
user_review_count=ldfu.review_id.count()
business_avg=ldfb.stars.mean()
business_review_count=ldfb.review_id.count()
nldf=ldf.copy()
nldf.set_index(['business_id'], inplace=True)
nldf['business_avg']=business_avg
nldf['business_review_count']=business_review_count
nldf.reset_index(inplace=True)
nldf.set_index(['user_id'], inplace=True)
nldf['user_avg']=user_avg
nldf['user_review_count']=user_review_count
nldf.reset_index(inplace=True)
return nldf
In [ ]:
smalldf = fulldf[(fulldf.business_review_count > 150) & (fulldf.user_review_count > 60)]
smalldf = recompute_frame(smalldf) #refresh the review counts and averages on the subset; later cells use smalldf
The common support is an important concept: for each pair of restaurants, it's the number of people who reviewed both. It will be used to modify the similarity between restaurants, since a similarity computed on a small common support is less believable. A shrinkage sketch follows the histogram below.
In [ ]:
restaurants=smalldf.business_id.unique() #get all the unique restaurant ids
supports=[]
for i,rest1 in enumerate(restaurants): # first restaurant (rest1) in the pair
for j,rest2 in enumerate(restaurants): # second restaurant(rest2) in the pair
if i < j: #skip pairing same restaurants and forming duplicate pairs
rest1_reviewers = smalldf[smalldf.business_id==rest1].user_id.unique() #find all unique users who reviewed restaurant 1
rest2_reviewers = smalldf[smalldf.business_id==rest2].user_id.unique() #find all unique users who reviewed restaurant 2
common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers) # find common reviewers by taking intersection
supports.append(len(common_reviewers)) # add the no of common reviewers to list
print "Mean support is:",np.mean(supports)
plt.hist(supports) #plot hist of list
histogram_style()
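A standard way to fold the support into the similarity is shrinkage: scale the raw similarity by n_common/(n_common + reg), so poorly supported pairs are pulled towards zero. A minimal sketch (shrunk_sim and reg are illustrative names, not defined elsewhere in this notebook):
In [ ]:
#shrink a raw similarity towards 0 when the common support n_common is small
def shrunk_sim(sim, n_common, reg=3.):
    return (n_common * sim) / (n_common + reg)
#with reg=3., a pair with only 3 common reviewers keeps half its raw similarity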
In [ ]:
# test business id
testbizid="eIxSLxzIlfExI6vgAbn2JA"
In [ ]:
# Return business name from a df given id
def biznamefromid(df, theid):
return df['biz_name'][df['business_id']==theid].values[0]
In [ ]:
print testbizid, biznamefromid(smalldf,testbizid)
In [ ]:
predictwise = pd.read_csv('data/predictwise.csv').set_index('States')
predictwise.head(10)
In [ ]:
#Your code here
def simulate_election(model, n_sim):
    #one uniform draw per state (50 states + DC) per simulation
    simulations = np.random.uniform(size=(51, n_sim))
    #a state's electoral votes go to Obama when its draw falls below his win probability
    obama_votes = (simulations < model.Obama.values.reshape(-1, 1)) * model.Votes.values.reshape(-1, 1)
    #total Obama electoral votes, one entry per simulation
    results = obama_votes.sum(axis=0)
    return results
In [ ]:
result = simulate_election(predictwise, 10000)
result
In [ ]:
#your code here
def plot_simulation(simulation):
    plt.hist(simulation, bins=np.arange(200, 538, 1),
             label='simulations', align='left', density=True) #'normed' was replaced by 'density' in matplotlib
plt.axvline(332, 0, .5, color='r', label='Actual Outcome')
plt.axvline(269, 0, .5, color='k', label='Victory Threshold')
p05 = np.percentile(simulation, 5.)
p95 = np.percentile(simulation, 95.)
iq = int(p95 - p05)
pwin = ((simulation >= 269).mean() * 100)
plt.title("Chance of Obama Victory: %0.2f%%, Spread: %d votes" % (pwin, iq))
plt.legend(frameon=False, loc='upper left')
plt.xlabel("Obama Electoral College Votes")
plt.ylabel("Probability")
remove_border()
In [ ]:
plot_simulation(result)
plt.xlim(240,380)
In [ ]:
# import data
critics = pd.read_csv('critics.csv') # Rotten Tomatoes Top Critics Reviews Data for about 3000 movies
# clean data
critics = critics[~critics.quote.isnull()]
critics = critics[critics.fresh.notnull()]
critics = critics[critics.quote.str.len() > 0]
critics = critics.reset_index(drop = True)
critics.head()
What does the distribution of number of reviews per reviewer look like?
In [ ]:
plt.hist(critics.groupby('critic').rtid.count(),log = True, bins=range(20), edgecolor='white')
plt.xlabel("Number of reviews per critic")
plt.ylabel("N")
histogram_style()
Of the critics with > 100 reviews, plot the distribution of average "freshness" rating per critic
In [ ]:
df = critics.copy()
df['fresh'] = df.fresh == 'fresh'
grp = df.groupby('critic')
counts = grp.rtid.count()
means = grp.fresh.mean()
means[counts > 100].hist(bins=10, edgecolor='w', lw=1)
plt.xlabel("Average rating per critic")
plt.ylabel("N")
plt.yticks([0, 2, 4, 6, 8, 10])
histogram_style()
Using the original movies dataframe, plot the Rotten Tomatoes Top Critics rating as a function of year. Overplot the average for each year, ignoring the score=0 examples (some of these are missing data).
In [ ]:
# Get the data
from io import StringIO
movie_txt = requests.get('https://raw.github.com/cs109/cs109_data/master/movies.dat').text
movie_file = StringIO(movie_txt) # treat a string like a file
movies = pd.read_csv(movie_file, delimiter='\t')
movies = movies.dropna()
movies = movies.reset_index(drop = True)
# process data
data = movies[['year','rtTopCriticsRating']]
#data = data.convert_objects(convert_numeric=True) #deprecated
data = data.apply(pd.to_numeric)
data = data[(data.rtTopCriticsRating > 0.00)]
means = data.groupby('year').mean()
# plotting
plt.plot(data['year'], data['rtTopCriticsRating'], 'o', mec = 'none', alpha = .5, label = 'Data' )
plt.plot(means.index, means['rtTopCriticsRating'], '-', label = 'Yearly Average' )
plt.legend(loc ='lower left', frameon = False)
plt.xlabel("Year")
plt.ylabel("Average Score")
remove_border()
This graph shows a trend towards a lower average score, as well as a greater abundance of low scores, with time. This is probably at least partially a selection effect -- Rotten Tomatoes probably doesn't archive reviews for all movies, especially ones that came out before the website existed. Thus, reviews of old movies are more often "the classics": mediocre old movies have been partially forgotten, and are underrepresented in the data. A question worth exploring is whether the downward trend reflects movies genuinely getting worse or simply this changing composition of the sample.
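If the selection effect is real, the number of movies per year in this sample should rise steeply towards the present. A minimal sketch of that check, reusing the data frame from the cell above:
In [ ]:
#count how many rated movies the sample contains per year
counts = data.groupby('year').rtTopCriticsRating.count()
plt.plot(counts.index, counts.values, '-')
plt.xlabel("Year")
plt.ylabel("Number of movies in sample")
remove_border()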
In [ ]:
df=pd.read_csv("01_heights_weights_genders.csv")
df.head()
In [ ]:
from sklearn.linear_model import LogisticRegression
clf_l, Xtrain_l, ytrain_l, Xtest_l, ytest_l = do_classify(LogisticRegression(), {"C": [0.01, 0.1, 1, 10, 100]}, df, ['Weight', 'Height'], 'Gender','Male')
In [ ]:
plt.figure(figsize =(12,8))
plt.xlabel("Weight")
plt.ylabel("Height")
ax=plt.gca()
points_plot(ax, Xtrain_l, Xtest_l, ytrain_l, ytest_l, clf_l, alpha=0.2);
Let us plot the probabilities obtained from predict_proba, overlaid on the samples with their true labels:
In [ ]:
plt.figure(figsize =(12,8))
plt.grid(True)
ax=plt.gca()
points_plot_prob(ax, Xtrain_l, Xtest_l, ytrain_l, ytest_l, clf_l, psize=20, alpha=0.1);
In [ ]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA #sklearn.lda was removed; this is the current import path
clflda = LDA(solver="svd", store_covariance=True)
clflda.fit(Xtrain_l, ytrain_l)
In [ ]:
#from REF
from scipy import linalg
def plot_ellipse(splot, mean, cov, color):
v, w = linalg.eigh(cov)
u = w[0] / linalg.norm(w[0])
angle = np.arctan(u[1] / u[0])
angle = 180 * angle / np.pi # convert to degrees
    #Gaussian contours at 1, 2, and 3 standard deviations ('angle' must be passed by keyword in current matplotlib)
    ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5,
                              angle=180 + angle, color=color, lw=3, fill=False)
    ell.set_clip_box(splot.bbox)
    ell1 = mpl.patches.Ellipse(mean, 1 * v[0] ** 0.5, 1 * v[1] ** 0.5,
                               angle=180 + angle, color=color, lw=3, fill=False)
    ell1.set_clip_box(splot.bbox)
    ell3 = mpl.patches.Ellipse(mean, 3 * v[0] ** 0.5, 3 * v[1] ** 0.5,
                               angle=180 + angle, color=color, lw=3, fill=False)
    ell3.set_clip_box(splot.bbox)
#ell.set_alpha(0.2)
splot.add_artist(ell)
splot.add_artist(ell1)
splot.add_artist(ell3)
#splot.set_xticks(())
#splot.set_yticks(())
def plot_lda_cov(lda, splot):
plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')
In [ ]:
plt.figure(figsize =(12,8))
ax=plt.gca()
spl,_,_=points_plot(ax,Xtrain_l, Xtest_l, ytrain_l, ytest_l, clflda)
plot_lda_cov(clflda, spl)
In [ ]:
dfd=pd.read_csv("https://dl.dropboxusercontent.com/u/75194/pima.csv")
dfd.head()
In [ ]:
from sklearn.model_selection import train_test_split #sklearn.cross_validation was removed in 0.20
from sklearn.metrics import confusion_matrix
features=['npreg', 'bmi','diaped', 'age', 'glconc','insulin']
X=dfd[features].values
y=dfd.diabetes.values
from sklearn.naive_bayes import GaussianNB
clf=GaussianNB()
itr, ite = train_test_split(np.arange(y.shape[0])) #split row indices so the same split can index X and y
Xtr=X[itr]
Xte=X[ite]
ytr=y[itr]
yte=y[ite]
clf.fit(Xtr,ytr)
print "Frac of mislabeled points",float((yte != clf.predict(Xte)).sum())/yte.shape[0]
confusion_matrix(clf.predict(Xte),yte)
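A calibration plot checks whether these predicted probabilities can be taken at face value: the test examples are binned by predicted probability, and within each bin the empirical fraction of positives is compared against the bin midpoint (a well-calibrated classifier tracks the diagonal). The error bars below use the binomial standard deviation $\sqrt{p(1-p)/n}$ of each bin.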
In [ ]:
def calibration_plot(clf, xtest, ytest):
prob = clf.predict_proba(xtest)[:, 1]
outcome = ytest
data = pd.DataFrame(dict(prob=prob, outcome=outcome))
#group outcomes into bins of similar probability
bins = np.linspace(0, 1, 20)
cuts = pd.cut(prob, bins)
binwidth = bins[1] - bins[0]
#freshness ratio and number of examples in each bin
cal = data.groupby(cuts).outcome.agg(['mean', 'count'])
cal['pmid'] = (bins[:-1] + bins[1:]) / 2
cal['sig'] = np.sqrt(cal.pmid * (1 - cal.pmid) / cal['count'])
#the calibration plot
ax = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
p = plt.errorbar(cal.pmid, cal['mean'], cal['sig'])
plt.plot(cal.pmid, cal.pmid, linestyle='--', lw=1, color='k')
plt.ylabel("Empirical Fraction")
#the distribution of P(fresh)
ax = plt.subplot2grid((3, 1), (2, 0), sharex=ax)
#calsum = cal['count'].sum()
    plt.bar(cal.pmid - binwidth / 2, cal['count'],
            width=.95 * (bins[1] - bins[0]),
            align='edge', fc=p[0].get_color()) #'left=' was removed from plt.bar; align='edge' keeps the old left-edge behavior
plt.xlabel("Classifier Probability")
In [ ]:
calibration_plot(clf, Xte, yte)
In [ ]:
plt.figure(figsize=(11, 9))
import scipy.stats as stats
beta = stats.beta
n_trials = [0, 1, 2, 3, 4, 5, 8, 15, 50, 500]
data = stats.bernoulli.rvs(0.5, size=n_trials[-1])
x = np.linspace(0, 1, 100)
# For the already prepared, I'm using Binomial's conj. prior.
for k, N in enumerate(n_trials):
    sx = plt.subplot(len(n_trials) // 2, 2, k + 1) #integer division; a float subplot count fails on Python 3
plt.xlabel("$p$, probability of heads") \
if k in [0, len(n_trials) - 1] else None
plt.setp(sx.get_yticklabels(), visible=False)
heads = data[:N].sum()
#posterior distribution.
y = beta.pdf(x, 1 + heads, 1 + N - heads)
plt.plot(x, y, label="observe %d tosses,\n %d heads" % (N, heads))
plt.fill_between(x, 0, y, color="#348ABD", alpha=0.4)
plt.vlines(0.5, 0, 4, color="k", linestyles="--", lw=1)
leg = plt.legend()
leg.get_frame().set_alpha(0.4)
plt.autoscale(tight=True)
plt.suptitle("Bayesian updating of posterior probabilities",
y=1.02,
fontsize=14)
plt.tight_layout()
In [ ]:
Y = [16.4, 17.0, 17.2, 17.4, 18.2, 18.2, 18.2, 19.9, 20.8]
# Prior mean
mu_prior = 19.5
# prior std
tau = 10
N = 15000
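With a normal likelihood of known $\sigma$ and a normal prior on the mean, the posterior for the mean is again normal: writing $\kappa = \sigma^2/\tau^2$, the next cell computes $\mu_{post} = \frac{\kappa}{\kappa+n}\,\mu_{prior} + \frac{n}{\kappa+n}\,\bar{y}$ and $\sigma_{post} = \left(1/\tau^2 + n/\sigma^2\right)^{-1/2}$.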
In [ ]:
#Data Quantities
sig = np.std(Y) # assume that is the value of KNOWN sigma (in the likelihood)
mu_data = np.mean(Y)
n = len(Y)
kappa = sig**2 / tau**2 #prior weight, in "effective number of observations"
sig_post = np.sqrt(1./(1./tau**2 + n/sig**2)) #posterior std: precisions add
#posterior mean: precision-weighted average of the prior mean and the data mean
mu_post = kappa/(kappa + n) * mu_prior + n/(kappa + n) * mu_data
#samples
theta_prior = np.random.normal(loc=mu_prior, scale=tau, size=N);
theta_post = np.random.normal(loc=mu_post, scale=sig_post, size=N);
In [ ]:
plt.hist(theta_post, bins=30, alpha=0.9, label="posterior");
plt.hist(theta_prior, bins=30, alpha=0.2, label="prior");
plt.xlim([10, 30])
plt.xlabel("wing length (mm)")
plt.ylabel("Number of samples")
plt.legend();