In [ ]:
%matplotlib inline
from collections import defaultdict
import json
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl
#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
(0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
(0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
(0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
(0.4, 0.6509803921568628, 0.11764705882352941),
(0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
(0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors) #'axes.color_cycle' was removed in matplotlib 2.0
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
#rcParams['font.family'] = 'sans-serif'
#rcParams['font.sans-serif'] = 'Helvetica'
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
"""
Minimize chartjunk by stripping out unnecessary plot borders and axis ticks.
The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn.
"""
ax = axes or plt.gca()
ax.spines['top'].set_visible(top)
ax.spines['right'].set_visible(right)
ax.spines['left'].set_visible(left)
ax.spines['bottom'].set_visible(bottom)
#turn off all ticks
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_ticks_position('none')
#now re-enable ticks on the visible sides
if top:
ax.xaxis.tick_top()
if bottom:
ax.xaxis.tick_bottom()
if left:
ax.yaxis.tick_left()
if right:
ax.yaxis.tick_right()
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
#utility functions
def histogram_style():
remove_border(left=False)
plt.grid(False)
plt.grid(axis='y', color='w', linestyle='-', lw=1)
def histogram_labels(xlabel, ylabel, title, loc):
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.title(title)
plt.legend(frameon=False, loc=loc)
def histogram_settings(xlabel, ylabel, title, loc = 'upper left'):
histogram_style()
histogram_labels(xlabel, ylabel, title, loc)
In [ ]:
from matplotlib.colors import ListedColormap
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
cm = plt.cm.RdBu #note: this shadows the matplotlib.cm module imported above
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
def points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=True, colorscale=cmap_light, cdiscrete=cmap_bold, alpha=0.1, psize=10, zfunc=False, predicted=False):
h = .02
X=np.concatenate((Xtr, Xte))
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
np.linspace(y_min, y_max, 100))
#plt.figure(figsize=(10,6))
    if zfunc:
        #call predict_proba once and pass both class probabilities to zfunc
        probs = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
        Z = zfunc(probs[:, 0], probs[:, 1])
    else:
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
ZZ = Z.reshape(xx.shape)
if mesh:
        ax.pcolormesh(xx, yy, ZZ, cmap=colorscale, alpha=alpha) #draw on the given Axes and honor the colorscale argument; pyplot has no axes= kwarg
if predicted:
showtr = clf.predict(Xtr)
showte = clf.predict(Xte)
else:
showtr = ytr
showte = yte
    ax.scatter(Xtr[:, 0], Xtr[:, 1], c=showtr-1, cmap=cdiscrete, s=psize, alpha=alpha, edgecolor="k")
    # and testing points
    ax.scatter(Xte[:, 0], Xte[:, 1], c=showte-1, cmap=cdiscrete, alpha=alpha, marker="s", s=psize+10)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
return ax,xx,yy
def points_plot_prob(ax, Xtr, Xte, ytr, yte, clf, colorscale=cmap_light, cdiscrete=cmap_bold, ccolor=cm, psize=10, alpha=0.1):
ax,xx,yy = points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=False, colorscale=colorscale, cdiscrete=cdiscrete, psize=psize, alpha=alpha, predicted=True)
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=ccolor, alpha=.2)
    cs2 = ax.contour(xx, yy, Z, cmap=ccolor, alpha=.6)
    ax.clabel(cs2, fmt='%2.1f', colors='k', fontsize=14)
return ax
In [ ]:
from sklearn.model_selection import train_test_split, GridSearchCV #sklearn.cross_validation and sklearn.grid_search were removed in 0.20
"""
Function
--------
def cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5)
Performs n_folds-fold cross-validation over a grid of parameter values (say the regularization
parameter C) and returns the estimator refit with the best-performing parameters
Parameters
----------
clf : Model (eg: LogisticRegression())
parameters: List of parameter values to test eg. {"C": [0.01, 0.1, 1, 10, 100]}
Xtrain: Training features
ytrain: Training labels
n_folds: No of cross validation folds eg. 5
Returns
-------
best: the estimator refit with the best parameter values from the grid
"""
def cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5):
gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
gs.fit(Xtrain, ytrain)
print "BEST PARAMS", gs.best_params_
best = gs.best_estimator_
return best
"""
Function
--------
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, standardize=False, train_size=0.8)
Trains a model and computes the accuracy on train and test data. Sub-tasks included:
1. Subset data frame by the given features
2. Split into train and test data
3. Cross validation using cv_optimize
4. Fit the model
5. Compute and print train and test accuracy
Parameters
----------
clf : Model (eg: LogisticRegression())
parameters: List of parameter values to test eg. {"C": [0.01, 0.1, 1, 10, 100]}
indf: data frame
featurenames: List of feature names
targetname: Name of prediction variable
target1val: Value of prediction variable
Returns
-------
clf, Xtrain, ytrain, Xtest, ytest
"""
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, standardize=False, train_size=0.8):
subdf=indf[featurenames]
if standardize:
subdfstd=(subdf - subdf.mean())/subdf.std()
else:
subdfstd=subdf
X=subdfstd.values
y=(indf[targetname].values==target1val)*1
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size)
clf = cv_optimize(clf, parameters, Xtrain, ytrain)
clf=clf.fit(Xtrain, ytrain)
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)
print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data: %0.2f" % (test_accuracy)
return clf, Xtrain, ytrain, Xtest, ytest
In [ ]:
import simplejson as json
with open('./data/us_state_map.geojson','r') as fp:
statedata = json.load(fp)
with open('./data/us_county_map.geojson','r') as fp:
data = json.load(fp)
In [1]:
#your code here
def get_senate_vote(vote):
url = 'https://www.govtrack.us/data/congress/113/votes/2013/s'+str(vote)+'/data.json'
vote_data = requests.get(url)
return json.loads(vote_data.text)
In [2]:
#Your code here
def get_all_votes():
    url = 'https://www.govtrack.us/data/congress/113/votes/2013'
    response = requests.get(url)
    soup = bs(response.content, "lxml")
    #senate vote directories look like 's<number>/'; strip the leading 's' and trailing '/'
    s_votes = [a['href'][1:-1] for a in soup.find_all('a', href=True)
               if a['href'].startswith('s')]
    return [get_senate_vote(vote) for vote in s_votes]
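A quick usage sketch (network access required; that roll call 11 exists and the govtrack directory layout is unchanged are assumptions here, and the variable names are illustrative):
In [ ]:
#fetch one roll call, then scrape the directory for every 2013 senate vote
vote11 = get_senate_vote(11) #assumed example roll call; returns the parsed JSON dict
all_votes = get_all_votes() #one dict per senate vote; this issues one HTTP request per vote
print(len(all_votes))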
In [ ]:
fulldf=pd.read_csv("bigdf.csv")
The following function is used to re-compute review counts and averages whenever you subset a reviews data frame. We'll use it soon to construct a smaller, more computationally tractable data frame.
In [ ]:
def recompute_frame(ldf):
"""
takes a dataframe ldf, makes a copy of it, and returns the copy
with all averages and review counts recomputed
this is used when a frame is subsetted.
"""
ldfu=ldf.groupby('user_id')
ldfb=ldf.groupby('business_id')
user_avg=ldfu.stars.mean()
user_review_count=ldfu.review_id.count()
business_avg=ldfb.stars.mean()
business_review_count=ldfb.review_id.count()
nldf=ldf.copy()
nldf.set_index(['business_id'], inplace=True)
nldf['business_avg']=business_avg
nldf['business_review_count']=business_review_count
nldf.reset_index(inplace=True)
nldf.set_index(['user_id'], inplace=True)
nldf['user_avg']=user_avg
nldf['user_review_count']=user_review_count
nldf.reset_index(inplace=True)
return nldf
In [ ]:
smalldf = fulldf[(fulldf.business_review_count > 150) & (fulldf.user_review_count > 60)]
smalldf = recompute_frame(smalldf) #refresh the review counts and averages on the subset; later cells use smalldf
The common support is an important concept: for each pair of restaurants, it's the number of people who reviewed both. It will be used to modify the similarity between restaurants, since a similarity computed on a small common support is less believable. A shrinkage sketch follows the histogram below.
In [ ]:
restaurants=smalldf.business_id.unique() #get all the unique restaurant ids
supports=[]
for i,rest1 in enumerate(restaurants): # first restaurant (rest1) in the pair
for j,rest2 in enumerate(restaurants): # second restaurant(rest2) in the pair
if i < j: #skip pairing same restaurants and forming duplicate pairs
rest1_reviewers = smalldf[smalldf.business_id==rest1].user_id.unique() #find all unique users who reviewed restaurant 1
rest2_reviewers = smalldf[smalldf.business_id==rest2].user_id.unique() #find all unique users who reviewed restaurant 2
common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers) # find common reviewers by taking intersection
supports.append(len(common_reviewers)) # add the no of common reviewers to list
print "Mean support is:",np.mean(supports)
plt.hist(supports) #plot hist of list
histogram_style()
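A standard way to fold the support into the similarity is shrinkage: scale the raw similarity by n_common/(n_common + reg), so poorly supported pairs are pulled towards zero. A minimal sketch (shrunk_sim and reg are illustrative names, not defined elsewhere in this notebook):
In [ ]:
#shrink a raw similarity towards 0 when the common support n_common is small
def shrunk_sim(sim, n_common, reg=3.):
    return (n_common * sim) / (n_common + reg)
#with reg=3., a pair with only 3 common reviewers keeps half its raw similarity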
In [ ]:
# test business id
testbizid="eIxSLxzIlfExI6vgAbn2JA"
In [ ]:
# Return business name from a df given id
def biznamefromid(df, theid):
return df['biz_name'][df['business_id']==theid].values[0]
In [ ]:
print testbizid, biznamefromid(smalldf,testbizid)
In [ ]:
predictwise = pd.read_csv('data/predictwise.csv').set_index('States')
predictwise.head(10)
In [ ]:
#Your code here
def simulate_election(model, n_sim):
    #one uniform draw per state (50 states + DC) per simulation
    simulations = np.random.uniform(size=(51, n_sim))
    #a state's electoral votes go to Obama when its draw falls below his win probability
    obama_votes = (simulations < model.Obama.values.reshape(-1, 1)) * model.Votes.values.reshape(-1, 1)
    #total Obama electoral votes, one entry per simulation
    results = obama_votes.sum(axis=0)
    return results
In [ ]:
result = simulate_election(predictwise, 10000)
result
In [ ]:
#your code here
def plot_simulation(simulation):
    plt.hist(simulation, bins=np.arange(200, 538, 1),
             label='simulations', align='left', density=True) #'normed' was replaced by 'density' in matplotlib
plt.axvline(332, 0, .5, color='r', label='Actual Outcome')
plt.axvline(269, 0, .5, color='k', label='Victory Threshold')
p05 = np.percentile(simulation, 5.)
p95 = np.percentile(simulation, 95.)
iq = int(p95 - p05)
pwin = ((simulation >= 269).mean() * 100)
plt.title("Chance of Obama Victory: %0.2f%%, Spread: %d votes" % (pwin, iq))
plt.legend(frameon=False, loc='upper left')
plt.xlabel("Obama Electoral College Votes")
plt.ylabel("Probability")
remove_border()
In [ ]:
plot_simulation(result)
plt.xlim(240,380)
In [ ]:
# import data
critics = pd.read_csv('critics.csv') # Rotten Tomatoes Top Critics Reviews Data for about 3000 movies
# clean data
critics = critics[~critics.quote.isnull()]
critics = critics[critics.fresh.notnull()]
critics = critics[critics.quote.str.len() > 0]
critics = critics.reset_index(drop = True)
critics.head()
What does the distribution of number of reviews per reviewer look like?
In [ ]:
plt.hist(critics.groupby('critic').rtid.count(),log = True, bins=range(20), edgecolor='white')
plt.xlabel("Number of reviews per critic")
plt.ylabel("N")
histogram_style()
Of the critics with > 100 reviews, plot the distribution of average "freshness" rating per critic
In [ ]:
df = critics.copy()
df['fresh'] = df.fresh == 'fresh'
grp = df.groupby('critic')
counts = grp.rtid.count()
means = grp.fresh.mean()
means[counts > 100].hist(bins=10, edgecolor='w', lw=1)
plt.xlabel("Average rating per critic")
plt.ylabel("N")
plt.yticks([0, 2, 4, 6, 8, 10])
histogram_style()
Using the original movies dataframe, plot the Rotten Tomatoes Top Critics rating as a function of year. Overplot the average for each year, ignoring the score=0 examples (some of these are missing data).
In [ ]:
# Get the data
from io import StringIO
movie_txt = requests.get('https://raw.github.com/cs109/cs109_data/master/movies.dat').text
movie_file = StringIO(movie_txt) # treat a string like a file
movies = pd.read_csv(movie_file, delimiter='\t')
movies = movies.dropna()
movies = movies.reset_index(drop = True)
# process data
data = movies[['year','rtTopCriticsRating']]
#data = data.convert_objects(convert_numeric=True) #deprecated
data = data.apply(pd.to_numeric)
data = data[(data.rtTopCriticsRating > 0.00)]
means = data.groupby('year').mean()
# plotting
plt.plot(data['year'], data['rtTopCriticsRating'], 'o', mec = 'none', alpha = .5, label = 'Data' )
plt.plot(means.index, means['rtTopCriticsRating'], '-', label = 'Yearly Average' )
plt.legend(loc ='lower left', frameon = False)
plt.xlabel("Year")
plt.ylabel("Average Score")
remove_border()
This graph shows a trend towards a lower average score, as well as a greater abundance of low scores, with time. This is probably at least partially a selection effect -- Rotten Tomatoes probably doesn't archive reviews for all movies, especially ones that came out before the website existed. Thus, reviews of old movies are more often "the classics": mediocre old movies have been partially forgotten, and are underrepresented in the data. A question worth exploring is whether the downward trend reflects movies genuinely getting worse or simply this changing composition of the sample.
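If the selection effect is real, the number of movies per year in this sample should rise steeply towards the present. A minimal sketch of that check, reusing the data frame from the cell above:
In [ ]:
#count how many rated movies the sample contains per year
counts = data.groupby('year').rtTopCriticsRating.count()
plt.plot(counts.index, counts.values, '-')
plt.xlabel("Year")
plt.ylabel("Number of movies in sample")
remove_border()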
In [ ]:
df=pd.read_csv("01_heights_weights_genders.csv")
df.head()
In [ ]:
from sklearn.linear_model import LogisticRegression
clf_l, Xtrain_l, ytrain_l, Xtest_l, ytest_l = do_classify(LogisticRegression(), {"C": [0.01, 0.1, 1, 10, 100]}, df, ['Weight', 'Height'], 'Gender','Male')
In [ ]:
plt.figure(figsize =(12,8))
plt.xlabel("Weight")
plt.ylabel("Height")
ax=plt.gca()
points_plot(ax, Xtrain_l, Xtest_l, ytrain_l, ytest_l, clf_l, alpha=0.2);
Let us plot the probabilities obtained from predict_proba, overlaid on the samples with their true labels:
In [ ]:
plt.figure(figsize =(12,8))
plt.grid(True)
ax=plt.gca()
points_plot_prob(ax, Xtrain_l, Xtest_l, ytrain_l, ytest_l, clf_l, psize=20, alpha=0.1);
In [ ]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA #sklearn.lda was removed; this is the current import path
clflda = LDA(solver="svd", store_covariance=True)
clflda.fit(Xtrain_l, ytrain_l)
In [ ]:
#from REF
from scipy import linalg
def plot_ellipse(splot, mean, cov, color):
v, w = linalg.eigh(cov)
u = w[0] / linalg.norm(w[0])
angle = np.arctan(u[1] / u[0])
angle = 180 * angle / np.pi # convert to degrees
    #Gaussian contours at 1, 2, and 3 standard deviations ('angle' must be passed by keyword in current matplotlib)
    ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5,
                              angle=180 + angle, color=color, lw=3, fill=False)
    ell.set_clip_box(splot.bbox)
    ell1 = mpl.patches.Ellipse(mean, 1 * v[0] ** 0.5, 1 * v[1] ** 0.5,
                               angle=180 + angle, color=color, lw=3, fill=False)
    ell1.set_clip_box(splot.bbox)
    ell3 = mpl.patches.Ellipse(mean, 3 * v[0] ** 0.5, 3 * v[1] ** 0.5,
                               angle=180 + angle, color=color, lw=3, fill=False)
    ell3.set_clip_box(splot.bbox)
#ell.set_alpha(0.2)
splot.add_artist(ell)
splot.add_artist(ell1)
splot.add_artist(ell3)
#splot.set_xticks(())
#splot.set_yticks(())
def plot_lda_cov(lda, splot):
plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')
In [ ]:
plt.figure(figsize =(12,8))
ax=plt.gca()
spl,_,_=points_plot(ax,Xtrain_l, Xtest_l, ytrain_l, ytest_l, clflda)
plot_lda_cov(clflda, spl)
In [ ]:
dfd=pd.read_csv("https://dl.dropboxusercontent.com/u/75194/pima.csv")
dfd.head()
In [ ]:
from sklearn.model_selection import train_test_split #sklearn.cross_validation was removed in 0.20
from sklearn.metrics import confusion_matrix
features=['npreg', 'bmi','diaped', 'age', 'glconc','insulin']
X=dfd[features].values
y=dfd.diabetes.values
from sklearn.naive_bayes import GaussianNB
clf=GaussianNB()
itr, ite = train_test_split(np.arange(y.shape[0])) #split row indices so the same split can index X and y
Xtr=X[itr]
Xte=X[ite]
ytr=y[itr]
yte=y[ite]
clf.fit(Xtr,ytr)
print "Frac of mislabeled points",float((yte != clf.predict(Xte)).sum())/yte.shape[0]
confusion_matrix(clf.predict(Xte),yte)
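A calibration plot checks whether these predicted probabilities can be taken at face value: the test examples are binned by predicted probability, and within each bin the empirical fraction of positives is compared against the bin midpoint (a well-calibrated classifier tracks the diagonal). The error bars below use the binomial standard deviation $\sqrt{p(1-p)/n}$ of each bin.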
In [ ]:
def calibration_plot(clf, xtest, ytest):
prob = clf.predict_proba(xtest)[:, 1]
outcome = ytest
data = pd.DataFrame(dict(prob=prob, outcome=outcome))
#group outcomes into bins of similar probability
bins = np.linspace(0, 1, 20)
cuts = pd.cut(prob, bins)
binwidth = bins[1] - bins[0]
#freshness ratio and number of examples in each bin
cal = data.groupby(cuts).outcome.agg(['mean', 'count'])
cal['pmid'] = (bins[:-1] + bins[1:]) / 2
cal['sig'] = np.sqrt(cal.pmid * (1 - cal.pmid) / cal['count'])
#the calibration plot
ax = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
p = plt.errorbar(cal.pmid, cal['mean'], cal['sig'])
plt.plot(cal.pmid, cal.pmid, linestyle='--', lw=1, color='k')
plt.ylabel("Empirical Fraction")
#the distribution of P(fresh)
ax = plt.subplot2grid((3, 1), (2, 0), sharex=ax)
#calsum = cal['count'].sum()
    plt.bar(cal.pmid - binwidth / 2, cal['count'],
            width=.95 * (bins[1] - bins[0]),
            align='edge', fc=p[0].get_color()) #'left=' was removed from plt.bar; align='edge' keeps the old left-edge behavior
plt.xlabel("Classifier Probability")
In [ ]:
calibration_plot(clf, Xte, yte)
In [ ]:
plt.figure(figsize=(11, 9))
import scipy.stats as stats
beta = stats.beta
n_trials = [0, 1, 2, 3, 4, 5, 8, 15, 50, 500]
data = stats.bernoulli.rvs(0.5, size=n_trials[-1])
x = np.linspace(0, 1, 100)
# For the already prepared, I'm using Binomial's conj. prior.
for k, N in enumerate(n_trials):
    sx = plt.subplot(len(n_trials) // 2, 2, k + 1) #integer division; a float subplot count fails on Python 3
plt.xlabel("$p$, probability of heads") \
if k in [0, len(n_trials) - 1] else None
plt.setp(sx.get_yticklabels(), visible=False)
heads = data[:N].sum()
#posterior distribution.
y = beta.pdf(x, 1 + heads, 1 + N - heads)
plt.plot(x, y, label="observe %d tosses,\n %d heads" % (N, heads))
plt.fill_between(x, 0, y, color="#348ABD", alpha=0.4)
plt.vlines(0.5, 0, 4, color="k", linestyles="--", lw=1)
leg = plt.legend()
leg.get_frame().set_alpha(0.4)
plt.autoscale(tight=True)
plt.suptitle("Bayesian updating of posterior probabilities",
y=1.02,
fontsize=14)
plt.tight_layout()
In [ ]:
Y = [16.4, 17.0, 17.2, 17.4, 18.2, 18.2, 18.2, 19.9, 20.8]
# Prior mean
mu_prior = 19.5
# prior std
tau = 10
N = 15000
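With a normal likelihood of known $\sigma$ and a normal prior on the mean, the posterior for the mean is again normal: writing $\kappa = \sigma^2/\tau^2$, the next cell computes $\mu_{post} = \frac{\kappa}{\kappa+n}\,\mu_{prior} + \frac{n}{\kappa+n}\,\bar{y}$ and $\sigma_{post} = \left(1/\tau^2 + n/\sigma^2\right)^{-1/2}$.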
In [ ]:
#Data Quantities
sig = np.std(Y) # assume that is the value of KNOWN sigma (in the likelihood)
mu_data = np.mean(Y)
n = len(Y)
kappa = sig**2 / tau**2 #prior weight, in "effective number of observations"
sig_post = np.sqrt(1./(1./tau**2 + n/sig**2)) #posterior std: precisions add
#posterior mean: precision-weighted average of the prior mean and the data mean
mu_post = kappa/(kappa + n) * mu_prior + n/(kappa + n) * mu_data
#samples
theta_prior = np.random.normal(loc=mu_prior, scale=tau, size=N);
theta_post = np.random.normal(loc=mu_post, scale=sig_post, size=N);
In [ ]:
plt.hist(theta_post, bins=30, alpha=0.9, label="posterior");
plt.hist(theta_prior, bins=30, alpha=0.2, label="prior");
plt.xlim([10, 30])
plt.xlabel("wing length (mm)")
plt.ylabel("Number of samples")
plt.legend();