What can you tell me about the data?
In [5]:
import matplotlib.pyplot as plt
%matplotlib inline
In [6]:
import pandas as pd
df = pd.read_csv('../data/energy/energy.csv')
df.shape
Out[6]:
In [7]:
df.describe()
Out[7]:
X2, X3, X4 might be candidates for normalization;
X5, X6, X8 are likely discrete values;
Y1, Y2 are within the same range
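If we do normalize, a minimal sketch (not part of the original analysis) using scikit-learn's MinMaxScaler on those candidate columns:
from sklearn.preprocessing import MinMaxScaler
cols = ['X2', 'X3', 'X4']   # the candidates spotted above
df_norm = df.copy()
df_norm[cols] = MinMaxScaler().fit_transform(df[cols])
df_norm.describe()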
In [8]:
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.plotting import scatter_matrix
scatter_matrix(df, alpha=0.2, figsize=(18,18), diagonal='kde')
plt.show()
Observations: maybe X5 is binary? X1 and X2 seem to have a strong correlation.
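A quick way to check those guesses (a sketch against the df loaded above):
print(df.nunique())   # distinct values per column; 2 means binary
print(df.corr())      # correlation matrix to confirm the X1/X2 relationship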
In [9]:
import matplotlib.pyplot as plt
df.plot()
plt.show()
In [10]:
# Individual elements in the DataFrame
df['X1'].plot()
plt.show()
In [11]:
# Use the 'kind' keyword for different variations
# Other kinds: 'bar', 'hist', 'box', 'kde',
# 'area', 'scatter', 'hexbin', 'pie'
df.plot(x='X1', y='X2', kind='scatter')
plt.show()
In [12]:
# Note: in a notebook, savefig must run in the same cell as the plot,
# before plt.show(); on its own like this it saves an empty figure
plt.savefig('myfig.png')
In [13]:
# Complex functions in pandas.plotting that take a DataFrame or Series as an argument:
# Scatter Matrix, Andrews Curves, Parallel Coordinates, Lag Plot,
# Autocorrelation Plot, Bootstrap Plot, RadViz
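For instance, a quick sketch of one of these on a single column (lag_plot is really meant for time series, so on this dataset it mostly shows row-ordering structure; assumes the df loaded above):
from pandas.plotting import lag_plot
lag_plot(df['Y1'])   # plots Y1(t) against Y1(t+1)
plt.show()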
In [24]:
from utils import *
from sklearn.model_selection import train_test_split as tts
dataset = load_energy()
In [20]:
print(dataset)
In [22]:
type(dataset)
Out[22]:
What is a bunch??
We'll talk about that soon.
In the meantime ask for help...
In [23]:
help(dataset)
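Short answer: a Bunch is a dictionary whose keys are also attributes. A minimal sketch using scikit-learn's own Bunch class:
from sklearn.utils import Bunch
b = Bunch(data=[1, 2, 3], target=[0, 1, 0])
print(b['data'])   # dict-style access
print(b.data)      # attribute-style access to the same value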
In [25]:
dataset.data.shape
Out[25]:
In [15]:
dataset.target('Y1').shape
Out[15]:
In [26]:
# other ways to explore 'dataset'
print(dataset.DESCR)
In [27]:
# more ways to explore 'dataset'
dir(dataset)
Out[27]:
In [32]:
splits = tts(dataset.data, dataset.target('Y1'), test_size=0.2)
In [31]:
# what is splits?
print(splits)
In [33]:
X_train, X_test, y_train, y_test = splits
X_train.shape
Out[33]:
In [34]:
y_train.shape
Out[34]:
In [36]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
regr = linear_model.LinearRegression()
regr.fit(X_train,y_train)
Out[36]:
In [37]:
print(regr.coef_)
In [38]:
print(regr.intercept_)
In [39]:
print(mean_squared_error(y_test, regr.predict(X_test)))
In [40]:
regr.score(X_test,y_test)
# same as doing r2_score(y_test, regr.predict(X_test))
Out[40]:
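To convince yourself those two really are the same, a quick check (no new information):
# LinearRegression.score computes R^2 on the given data
print(regr.score(X_test, y_test))
print(r2_score(y_test, regr.predict(X_test)))   # should print the same value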
In [41]:
clf = linear_model.Ridge(alpha=0.5)
clf.fit(X_train, y_train)
Out[41]:
In [42]:
print(mean_squared_error(y_test, clf.predict(X_test)))
In [43]:
clf.score(X_test, y_test)
Out[43]:
So we picked a bad alpha; let's pick a better one...
In [44]:
import numpy as np
# try 200 alphas, log-spaced between 1e-10 and 1e-2
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)
clf = linear_model.RidgeCV(alphas=alphas)
clf.fit(X_train, y_train)
#which alpha did it pick?
print(clf.alpha_)
In [45]:
clf.score(X_test, y_test)
Out[45]:
In [54]:
# plot our alphas
clf = linear_model.Ridge(fit_intercept=False)
errors = []
In [55]:
for alpha in alphas:
    splits = tts(dataset.data, dataset.target('Y1'), test_size=0.2)
    X_train, X_test, y_train, y_test = splits
    clf.set_params(alpha=alpha)
    clf.fit(X_train, y_train)
    error = mean_squared_error(y_test, clf.predict(X_test))
    errors.append(error)
In [56]:
axe = plt.gca()
axe.plot(alphas, errors)
plt.show()
In [51]:
clf = linear_model.Lasso(alpha=0.5)
clf.fit(X_train, y_train)
Out[51]:
In [52]:
print(mean_squared_error(y_test, clf.predict(X_test)))
In [53]:
clf.score(X_test, y_test)
Out[53]:
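Worth a look with Lasso (a quick sketch): the L1 penalty can drive coefficients exactly to zero, so it doubles as crude feature selection.
print(clf.coef_)                            # Lasso coefficients
print("nonzero:", (clf.coef_ != 0).sum())   # features the penalty kept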
In [57]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
In [58]:
model = make_pipeline(PolynomialFeatures(2), linear_model.Ridge())
model.fit(X_train, y_train)
Out[58]:
In [59]:
mean_squared_error(y_test, model.predict(X_test))
Out[59]:
In [60]:
model.score(X_test, y_test)
Out[60]:
Now it's time to worry about overfitting.
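A sketch of what that check might look like for the pipeline above: compare train and test scores (a large gap suggests overfitting), and use cross-validation for a less split-dependent estimate.
from sklearn.model_selection import cross_val_score
print("train R^2:", model.score(X_train, y_train))
print("test R^2:", model.score(X_test, y_test))
# 5-fold cross-validated R^2 over the full dataset
scores = cross_val_score(model, dataset.data, dataset.target('Y1'), cv=5)
print(scores.mean(), "+/-", scores.std())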
In [61]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [62]:
x = np.linspace(-15,15,100)
# 100 evenly spaced nums between -15 and 15
y = np.sin(x)/x
# compute values of sin(x) / x
In [66]:
# compose plot
plt.plot(x,y, label="f(x)") # sin(x)/x
plt.plot(x,y, 'co', label="cyan dot f(x)")
plt.plot(x,2*y,x,3*y, label="scaled f(x)")
Out[66]:
In [69]:
# add plot details! Or else Ben will be mad
plt.plot(x,y, label="f(x)")
plt.plot(x,y, 'co', label="cyan dot f(x)")
plt.plot(x,2*y,x,3*y, label="scaled f(x)")
plt.xlabel("x-axis")
plt.ylabel("y-axis")
plt.title("Graph of Functions")
plt.legend()
plt.show()