In [1]:
# Sanity-check cell: prints a fixed greeting.
print("hello world")
In [2]:
########################################
# 1 #
########################################
# Load the boston dataset included with sklearn
##### Start solution code #####
from sklearn import datasets
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this call needs scikit-learn < 1.2 -- confirm the pinned version.
# The next cell reads dataset.DESCR and dataset.data from the returned object.
dataset = datasets.load_boston()
##### End solution code #####
In [3]:
########################################
# 2 #
########################################
# Run this cell to see what it does.
# Then modify the code to print both the description and the data point.
# These two bare expressions are not the cell's final statement, so the
# notebook does not echo either of them.
dataset.DESCR
dataset.data[0]
##### Start solution code #####
# Explicit print calls emit both values regardless of their position in the cell.
print(dataset.DESCR)
print(dataset.data[0])
##### End solution code #####
In [4]:
# Make plots appear inline rather than in a separate window
# no-import-all prevents importing * from numpy and matplotlib
# NOTE(review): the `plt` name used by the plotting cells below is never
# imported explicitly in this notebook; it appears to be bound by this %pylab
# magic (numpy as np, matplotlib.pyplot as plt) -- confirm.
%pylab inline --no-import-all
# Import some useful libraries
import scipy
import numpy
import pandas
import seaborn # Importing seaborn automatically makes our plots look better
# NOTE(review): the pyplt alias is not referenced by any visible cell.
import matplotlib.pyplot as pyplt
In [5]:
# Read the candy selections from a CSV given by a relative path; later cells
# iterate over its "candy" column.
df = pandas.read_csv("candy_choices.csv")
# Echo the per-column count of non-missing entries.
df.count()
Out[5]:
In [6]:
# Build event_list: one (selection index, selection, time since previous
# selection) tuple for every selection of a candy that was chosen before.
event_list = []
time_since_last = {}  # candy -> selections made since that candy was last chosen
i = 0  # index of the current selection within the sequence
for item in df["candy"].values:
    # A candy's very first selection has no earlier one to measure from,
    # so it only starts the clock and produces no event.
    if item in time_since_last:
        event_list.append((i, item, time_since_last[item]))
    # Age every candy seen so far by one selection ...
    for e in time_since_last:
        time_since_last[e] += 1
    # ... then (re)start the clock of the candy that was just chosen.
    time_since_last[item] = 0
    i += 1
In [7]:
# Preview the first ten recorded events.
event_list[:10]
Out[7]:
In [8]:
def plot_interselection_time(events, color, candy_name):
    """Plot one candy's interselection times in the order they occurred.

    Args:
        events: iterable of (selection index, candy, interselection time)
            tuples.
        color: line color passed to plt.plot.
        candy_name: candy whose events are drawn; also used as the legend label.

    The x-axis counts only this candy's own events (0, 1, 2, ...); the
    selection index stored in each event is ignored.
    """
    # Keep just the interselection times that belong to the requested candy
    times = [gap for (_, choice, gap) in events if choice == candy_name]
    # Draw the series against the candy's own event count
    plt.plot(range(len(times)), times, color=color, label=candy_name)
    # Legend first, then the axis labels
    legend_font = {'size': 14}
    plt.legend(frameon=True, shadow=True, framealpha=0.7, loc=0, prop=legend_font)
    plt.xlabel("Selection number", fontsize=14)
    plt.ylabel("Interselection time", fontsize=14)
In [9]:
# Interselection times for airhead, plotted against its own event count.
plot_interselection_time(events=event_list, color="orange", candy_name="airhead")
In [10]:
# Draw starburst and airhead from the same cell so the two lines can be compared.
plot_interselection_time(events=event_list, color="red", candy_name="starburst")
plot_interselection_time(events=event_list, color="orange", candy_name="airhead")
In [11]:
########################################
# 3 #
########################################
# Modify this function so that a 5 on the x-axis corresponds to
# the 5th time any candy was chosen
def plot_interselection_time_scaled(events, color, candy_name):
    """Exercise starting point: plot one candy's interselection times.

    This unmodified version still plots against the candy's own event count
    (0, 1, 2, ...). It is redefined by the solution code further down in this
    cell, which replaces this definition once the cell has run.

    Args:
        events: iterable of (selection index, candy, interselection time)
            tuples.
        color: line color passed to plt.plot.
        candy_name: candy whose events are drawn; also used as the legend label.
    """
    # Keep just the interselection times that belong to the requested candy
    times = [gap for (_, choice, gap) in events if choice == candy_name]
    # Draw the series against the candy's own event count
    plt.plot(range(len(times)), times, color=color, label=candy_name)
    # Legend first, then the axis labels
    legend_font = {'size': 14}
    plt.legend(frameon=True, shadow=True, framealpha=0.7, loc=0, prop=legend_font)
    plt.xlabel("Selection number", fontsize=14)
    plt.ylabel("Interselection time", fontsize=14)
##### Start solution code #####
def plot_interselection_time_scaled(events, color, candy_name):
    """Plot one candy's interselection times against the overall selection index.

    Unlike the unscaled plot, the x coordinate of each point is the selection
    index stored in the event, so lines for different candies share one
    time axis.

    Args:
        events: iterable of (selection index, candy, interselection time)
            tuples.
        color: line color passed to plt.plot.
        candy_name: candy whose events are drawn; also used as the legend label.
    """
    # Keep the (selection index, interselection time) pairs of the requested candy
    matching = [(index, gap) for (index, choice, gap) in events if choice == candy_name]
    selection_numbers = [index for (index, _) in matching]
    candy = [gap for (_, gap) in matching]
    # Draw the series against the overall selection index
    plt.plot(selection_numbers, candy, color=color, label=candy_name)
    # Legend first, then the axis labels
    legend_font = {'size': 14}
    plt.legend(frameon=True, shadow=True, framealpha=0.7, loc=0, prop=legend_font)
    plt.xlabel("Selection number", fontsize=14)
    plt.ylabel("Interselection time", fontsize=14)
##### End solution code
In [12]:
# Airhead again, now positioned by overall selection index.
plot_interselection_time_scaled(events=event_list, color="orange", candy_name="airhead")
In [13]:
# Starburst and airhead drawn from the same cell, both positioned by overall
# selection index.
plot_interselection_time_scaled(events=event_list, color="red", candy_name="starburst")
plot_interselection_time_scaled(events=event_list, color="orange", candy_name="airhead")
In [14]:
# Draw all six candies, one call per (color, candy) pair, in a fixed order.
for _line_color, _candy in [
    ("blue", "reeses"),
    ("green", "rolo"),
    ("yellow", "kitkat"),
    ("purple", "hersheys"),
    ("red", "starburst"),
    ("orange", "airhead"),
]:
    plot_interselection_time_scaled(event_list, _line_color, _candy)
In [15]:
# Reeses and rolo only, both positioned by overall selection index.
plot_interselection_time_scaled(events=event_list, color="blue", candy_name="reeses")
plot_interselection_time_scaled(events=event_list, color="green", candy_name="rolo")
In [16]:
# Build shared_state_events: a list of snapshots, each mapping every candy
# type to the most recently recorded time-since-selection for that candy.
import copy

# Seed snapshot: every candy starts at zero.
shared_state_events = [
    {"airhead":0, "starburst":0, "hersheys":0, "reeses":0, "kitkat":0, "rolo":0}
]
time_since_last = {}  # candy -> selections made since that candy was last chosen
i = 0  # index of the current selection within the sequence
for item in df["candy"].values:
    # The first sighting of a candy starts its counter at zero.
    time_since_last.setdefault(item, 0)
    # NOTE(review): this keeps appending to the event_list built by an earlier
    # cell, so re-running this cell (or the plotting cells after it) sees
    # extra events -- confirm whether that is intended.
    event_list.append((i, item, time_since_last[item]))
    # New snapshot = previous snapshot with only the chosen candy refreshed.
    curr_shared_event = copy.deepcopy(shared_state_events[-1])
    curr_shared_event[item] = time_since_last[item]
    shared_state_events.append(curr_shared_event)
    # Restart the chosen candy's clock, then age every other candy by one.
    time_since_last[item] = 0
    for e in time_since_last:
        if e != item:
            time_since_last[e] += 1
    i += 1
In [17]:
# One row per snapshot in shared_state_events, one column per candy key.
events_frame = pandas.DataFrame(shared_state_events)
In [18]:
# Echo the frame so its rows and column order can be inspected.
events_frame
Out[18]:
In [19]:
# Set a random seed so we will get the same results each time
import random
random.seed(5656)
# Randomly select 30 events for our test set
test_indices = set(random.sample(range(events_frame.shape[0]), 30))
# Select the columns by name rather than relying on the frame's column order.
# The snapshot dicts list their keys as airhead, starburst, hersheys, ...;
# pandas versions that preserve dict key order keep that order for the
# columns, and a purely positional unpack would then mislabel the features
# and use the wrong column as the label.
feature_columns = ["airhead", "hersheys", "kitkat", "reeses", "rolo"]
label_column = "starburst"
# Split our data into training and test data
train_features = []
train_labels = []
test_features = []
test_labels = []
i = 0  # row position, matched against the sampled test indices
for airhead, hersheys, kitkat, reeses, rolo, starburst in events_frame[feature_columns + [label_column]].values:
    if i in test_indices:
        # Use starburst as our label, and all others as our features
        test_features.append([airhead, hersheys, kitkat, reeses, rolo])
        test_labels.append(starburst)
    else:
        train_features.append([airhead, hersheys, kitkat, reeses, rolo])
        train_labels.append(starburst)
    i += 1
In [20]:
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
# Create the linear regression estimator through the directly imported class
# name, then fit it on the training split. fit() is left as the final
# expression so the cell echoes its return value.
model = LinearRegression()
model.fit(train_features, train_labels)
Out[20]:
In [21]:
# See which features had the most influence on our model.
# Each coefficient is paired with its feature name in the order the feature
# vectors were assembled ([airhead, hersheys, kitkat, reeses, rolo]) instead of
# the frame's column order, which also contains the label column. The pairs
# are materialized in a list because a bare zip is a lazy iterator on
# Python 3 and would not display its contents.
list(zip(["airhead", "hersheys", "kitkat", "reeses", "rolo"], model.coef_))
Out[21]:
In [22]:
# Print mean squared error and R^2 on the training set
# The MSE is computed by hand as the mean of the squared residuals.
print(numpy.mean((model.predict(train_features) - train_labels) ** 2))
# score() supplies the R^2 value named in the comment above.
print(model.score(train_features, train_labels))
In [23]:
# Plot predicted and true interselection times on the training set
# Both series are given to plt.plot without x values, so they are drawn against
# their position in the training split, not the original selection index.
plt.plot(train_labels, color="green", label="True value")
plt.plot(model.predict(train_features), label="Predicted value")
plt.xlabel("Selection number", fontsize=14)
plt.ylabel("Interselection time", fontsize=14)
# Final expression of the cell, so its return value is echoed as the output.
plt.legend(frameon=True, shadow=True, framealpha=0.7, loc=0, prop={"size": 14})
Out[23]:
In [24]:
# Print mean squared error and R^2 on the test set
# The MSE is computed by hand as the mean of the squared residuals.
print(numpy.mean((model.predict(test_features) - test_labels) ** 2))
# score() supplies the R^2 value named in the comment above.
print(model.score(test_features, test_labels))
In [25]:
# Plot predicted and true time since selection on the test set
# Both series are given to plt.plot without x values, so they are drawn against
# their position in the test split, not the original selection index.
plt.plot(test_labels, color="green", label="True value")
plt.plot(model.predict(test_features), label="Predicted value")
plt.xlabel("Selection number", fontsize=14)
plt.ylabel("Time since selection", fontsize=14)
# Final expression of the cell, so its return value is echoed as the output.
plt.legend(frameon=True, shadow=True, framealpha=0.7, loc=0, prop={"size": 14})
Out[25]:
In [26]:
# Restrict the features to just Airhead and Kitkat - the two most influential features
# Feature vectors were assembled as [airhead, hersheys, kitkat, reeses, rolo],
# so positions 0 and 2 pick out airhead and kitkat.
train_features_res = [[e[0], e[2]] for e in train_features]
# The labels are unchanged; these aliases keep the restricted names parallel.
train_labels_res = train_labels
test_features_res = [[e[0], e[2]] for e in test_features]
test_labels_res = test_labels
# Fit a second linear model on the two-feature inputs; fit() is the final
# expression, so the cell echoes its return value.
model_res = linear_model.LinearRegression()
model_res.fit(train_features_res, train_labels_res)
Out[26]:
In [27]:
# Plot predicted and true interselection times on the training set (restricted model)
# Both series are given to plt.plot without x values, so they are drawn against
# their position in the training split, not the original selection index.
plt.plot(train_labels_res, color="green", label="True interselection time")
plt.plot(model_res.predict(train_features_res), label="Predicted interselection time")
plt.xlabel("Selection number", fontsize=14)
plt.ylabel("Interselection time", fontsize=14)
# Final expression of the cell, so its return value is echoed as the output.
plt.legend(frameon=True, shadow=True, framealpha=0.7, loc=0, prop={"size": 14})
Out[27]:
In [28]:
# Print the mean squared error and R^2 of the restricted model on the training set
# The MSE is computed by hand as the mean of the squared residuals.
print(numpy.mean((model_res.predict(train_features_res) - train_labels_res) ** 2))
# score() supplies the R^2 value named in the comment above.
print(model_res.score(train_features_res, train_labels_res))
In [29]:
# Plot predicted and true interselection times on the test set (restricted model)
# Both series are given to plt.plot without x values, so they are drawn against
# their position in the test split, not the original selection index.
# The legend label spelling is corrected ("interselection") so it matches the
# label used on the corresponding training-set plot.
plt.plot(test_labels_res, color="green", label="True interselection time")
plt.plot(model_res.predict(test_features_res), label="Predicted interselection time")
plt.xlabel("Selection number", fontsize=14)
plt.ylabel("Interselection time", fontsize=14)
# Final expression of the cell, so its return value is echoed as the output.
plt.legend(frameon=True, shadow=True, framealpha=0.7, loc=0, prop={"size": 14})
Out[29]:
In [30]:
# Print the mean squared error and R^2 of the restricted model on the test set
# The MSE is computed by hand as the mean of the squared residuals.
print(numpy.mean((model_res.predict(test_features_res) - test_labels_res) ** 2))
# score() supplies the R^2 value named in the comment above.
print(model_res.score(test_features_res, test_labels_res))