Chapter 07
In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels as sms
%matplotlib inline
In [2]:
# Load the Wage data set; the CSV's first column is the row index.
wages = pd.read_csv('../data/Wage.csv', index_col=0)
wages.head()
Out[2]:
In [6]:
# Scatter plot of wage against age.
wages.plot(kind='scatter', x='age', y='wage')
plt.show()
In [13]:
# Add polynomial feature columns age_2 .. age_9 (age raised to powers 2-9)
# for the polynomial-regression experiments below. A loop replaces the
# eight copy-pasted assignments.
for power in range(2, 10):
    wages['age_' + str(power)] = wages['age'] ** power
In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
def validation(degree_expression):
    """Fit a linear model on the given feature columns and score it on a
    held-out validation set.

    Parameters
    ----------
    degree_expression : list of str
        Column names of `wages` to use as features,
        e.g. ['age', 'age_2', 'age_3'].

    Returns
    -------
    float
        R^2 score (coefficient of determination) on the 40% validation
        split — note: this is NOT a mean squared error.
    """
    X = wages[degree_expression].values
    y = wages['wage'].values
    # Fixed random_state keeps the split identical across degrees so the
    # comparison is fair.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    return lr.score(X_test, y_test)
# Evaluate polynomial degrees 1..9. Build the feature lists
# programmatically instead of hand-writing nine nested lists.
# Note: validation() returns R^2 scores, so the list is named `scores`
# (the original `mses` name was misleading).
feature_sets = [['age'] + ['age_' + str(p) for p in range(2, d + 1)]
                for d in range(1, 10)]
scores = [validation(features) for features in feature_sets]
degrees = range(1, 10)
plt.plot(degrees, scores)
plt.xlabel('Polynomial degree')
plt.ylabel('Validation set R^2 score')
plt.show()
In [30]:
# Fit the degree-3 model on the full data and overlay the fitted curve
# on the scatter plot.
X = wages[['age', 'age_2', 'age_3']].values
y = wages['wage'].values
lr = LinearRegression()
lr.fit(X, y)
coef = lr.coef_
inter = lr.intercept_
# Use distinct names for the evaluation grid and the fitted values —
# the original rebound `y` from the data to the curve, a hidden-state
# hazard if the cell is partially re-run.
grid = np.linspace(10, 80, 200)
fitted = inter + coef[0] * grid + coef[1] * grid ** 2 + coef[2] * grid ** 3
plt.scatter(wages['age'].values, wages['wage'].values)
plt.plot(grid, fitted, c='r')
plt.xlabel('age')
plt.ylabel('wage')
plt.show()
In [47]:
def stepwise(steps):
    """Fit a step (piecewise-constant) function of age to wage.

    Parameters
    ----------
    steps : list of float
        Interior cut points; -inf and +inf are appended so every age
        falls into exactly one bin.

    Returns
    -------
    float
        R^2 score on a 40% held-out validation split.
    """
    bounds = [-np.inf] + steps + [np.inf]
    y = wages['wage'].values
    x = wages['age'].values
    # Work on an explicit copy: assigning new columns into the slice
    # wages[['age']] writes into a view and triggers
    # SettingWithCopyWarning (and may silently not stick).
    df = wages[['age']].copy()
    indexes = []
    for i in range(len(bounds) - 1):
        index = 'x' + str(i)
        # Indicator column: 1 when age lies in [bounds[i], bounds[i+1]).
        df[index] = [1 if bounds[i] <= item < bounds[i + 1] else 0
                     for item in x]
        indexes.append(index)
    X = df[indexes]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    return lr.score(X_test, y_test)
# Try step functions built from an increasing number of cut points
# spread evenly over [0, 90], and score each on the validation split.
cuts = [2, 4, 6, 8, 10]
steps = [np.linspace(0, 90, cut).tolist() for cut in cuts]
scores = [stepwise(step) for step in steps]
plt.plot(cuts, scores)
plt.xlabel('Number of cut')
plt.ylabel('validation set scores')
plt.show()
When the number of cuts is 8, the model attains the highest validation score.
In [53]:
# Distinct marital-status categories present in the data.
wages.loc[:, 'maritl'].unique()
Out[53]:
In [59]:
# Wage distribution broken down by marital status.
wages.loc[:, ['wage', 'maritl']].boxplot(by='maritl')
Out[59]:
In [60]:
# Distinct job-class categories present in the data.
wages.loc[:, 'jobclass'].unique()
Out[60]:
In [61]:
# Wage distribution broken down by job class.
wages.loc[:, ['wage', 'jobclass']].boxplot(by='jobclass')
Out[61]:
In [65]:
auto_file_name = '../data/Auto'
# pd.read_table is deprecated; read_csv with a whitespace separator is
# the supported equivalent. '?' marks missing values in this data set,
# and the separator is a raw string so '\s' is not an invalid escape.
autos = pd.read_csv(auto_file_name, sep=r'\s+', na_values='?')
autos.head()
Out[65]:
In [66]:
# Discard the rows whose '?' entries were parsed as NaN.
autos = autos.dropna(how='any')
autos.head()
Out[66]:
In [73]:
# pandas.tools.plotting was removed (pandas >= 0.25); scatter_matrix now
# lives in pandas.plotting. Pass figsize directly — handing a single
# Axes from plt.subplots to a matrix plot is unsupported.
from pandas.plotting import scatter_matrix
scatter_matrix(autos, figsize=(15, 15));
mpg appears proportional to displacement, horsepower, weight, and acceleration.
In [77]:
# Boston housing data: nox (pollution) against dis (distance to
# employment centers).
boston_file_name = '../data/Boston.csv'
bostons = pd.read_csv(boston_file_name, index_col=0)
bostons.plot(kind='scatter', x='dis', y='nox');
In [85]:
# Cubic polynomial fit of nox on dis, drawn over the scatter plot.
bostons['dis_2'] = bostons['dis'] ** 2
bostons['dis_3'] = bostons['dis'] ** 3
X = bostons[['dis', 'dis_2', 'dis_3']].values
y = bostons['nox'].values
lr = LinearRegression()
lr.fit(X, y)
# Keep the 1-D grid and the stacked design matrix as separate names —
# the original rebound `x` to the matrix and had to recompute the
# linspace inside plt.plot.
grid = np.linspace(0, 12, 100)
X_grid = np.vstack((grid, grid ** 2, grid ** 3)).T
pred_y = lr.predict(X_grid)
plt.scatter(bostons['dis'].values, y)
plt.plot(grid, pred_y, c='r')
plt.xlabel('dis')
plt.ylabel('nox')
plt.show()
In [86]:
def degree_fit(degree):
    """Fit a polynomial of the given degree of dis to nox on the full
    data set and return the training residual sum of squares (RSS).
    """
    x = bostons['dis'].values
    y = bostons['nox'].values
    # Design matrix with columns x, x^2, ..., x^degree.
    X = np.vstack([x ** p for p in range(1, degree + 1)]).T
    lr = LinearRegression()
    lr.fit(X, y)
    residual = lr.predict(X) - y
    return np.sum(residual * residual)
# Training RSS for degrees 1..10 — it decreases monotonically with
# degree, as expected on training data. Labels added for readability,
# consistent with the earlier validation plot.
degrees = range(1, 11)
residuals = [degree_fit(degree) for degree in degrees]
plt.plot(degrees, residuals)
plt.xlabel('Polynomial degree')
plt.ylabel('Training RSS')
plt.show()
In [95]:
def valid_degree(degree):
    """Fit a polynomial of the given degree of dis to nox and return
    the mean 5-fold cross-validation R^2 score.

    Uses cross_val_score imported at the top of the notebook; the
    original re-imported it inside the function on every call.
    """
    x = bostons['dis'].values
    y = bostons['nox'].values
    stacks = [x]
    for i in range(2, degree + 1):
        stacks.append(x ** i)
    X = np.vstack(stacks).T
    lr = LinearRegression()
    scores = cross_val_score(lr, X, y, cv=5)
    return scores.mean()
# Cross-validated score for each polynomial degree 1..10.
degrees = range(1, 11)
scores = [valid_degree(d) for d in degrees]
plt.plot(degrees, scores)
plt.show()
In [ ]: