Chapter 07


In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels as sms
%matplotlib inline

6


In [2]:
wage_file_name = '../data/Wage.csv'
wages = pd.read_csv(wage_file_name, index_col=0)
wages.head()


Out[2]:
year age sex maritl race education region jobclass health health_ins logwage wage
231655 2006 18 1. Male 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic 1. Industrial 1. <=Good 2. No 4.318063 75.043154
86582 2004 24 1. Male 1. Never Married 1. White 4. College Grad 2. Middle Atlantic 2. Information 2. >=Very Good 2. No 4.255273 70.476020
161300 2003 45 1. Male 2. Married 1. White 3. Some College 2. Middle Atlantic 1. Industrial 1. <=Good 1. Yes 4.875061 130.982177
155159 2003 43 1. Male 2. Married 3. Asian 4. College Grad 2. Middle Atlantic 2. Information 2. >=Very Good 1. Yes 5.041393 154.685293
11443 2005 50 1. Male 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic 2. Information 1. <=Good 1. Yes 4.318063 75.043154

6(a)


In [6]:
wages.plot.scatter(x='age',y='wage')
plt.show()



In [13]:
wages['age_2'] = wages['age']**2
wages['age_3'] = wages['age']**3
wages['age_4'] = wages['age']**4
wages['age_5'] = wages['age']**5
wages['age_6'] = wages['age']**6
wages['age_7'] = wages['age']**7
wages['age_8'] = wages['age']**8
wages['age_9'] = wages['age']**9

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
def validation(degree_expression):
    '''
    degress_expression is like ['age','age2','age3']
    '''
    X = wages[degree_expression].values
    y = wages['wage'].values
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4, random_state=0)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    return lr.score(X_test,y_test)
    #residual = lr.predict(X_test) - y_test
    #return np.sum(residual*residual)/residual.shape[0]
    #clf = LinearRegression()
    #return cross_val_score(clf, X, y, cv=2)
expressions = [['age'],['age','age_2'],['age','age_2','age_3'],
               ['age','age_2','age_3','age_4'],['age','age_2','age_3','age_4','age_5'],
               ['age','age_2','age_3','age_4','age_5','age_6'],
              ['age','age_2','age_3','age_4','age_5','age_6','age_7'],
              ['age','age_2','age_3','age_4','age_5','age_6','age_7','age_8'],
              ['age','age_2','age_3','age_4','age_5','age_6','age_7','age_8','age_9']]
mses = []
for expression in expressions:
    mses.append(validation(expression))
degrees = range(1,10,1)
plt.plot(degrees, mses)
plt.show()



In [30]:
X = wages[['age','age_2','age_3']].values
y = wages['wage'].values
lr = LinearRegression()
lr.fit(X,y)
coef = lr.coef_
inter = lr.intercept_
x = np.linspace(10,80,200)
y = coef[0] * x**1 + coef[1] * x**2 + coef[2] * x**3 + inter
plt.scatter(wages['age'].values, wages['wage'].values)
plt.plot(x,y,c='r')
plt.show()


6(b)


In [47]:
def stepwise(steps):
    '''
    the step point
    '''
    steps = [-np.inf] + steps + [np.inf]
    y = wages['wage'].values
    x = wages['age'].values
    df = wages[['age']]
    indexes = []
    for i in range(len(steps)-1):
        index = 'x'+str(i)
        df[index] = [1 if steps[i] <= item < steps[i+1] else 0 for item in x]
        indexes.append(index)
    X = df[indexes]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    del df
    return lr.score(X_test,y_test)
cuts = [2,4,6,8,10]
steps = []
for cut in cuts:
    steps.append(np.linspace(0,90,cut).tolist())
scores = []
for step in steps:
    scores.append(stepwise(step))
plt.plot(cuts, scores)
plt.xlabel('Number of cut')
plt.ylabel('validation set scores')
plt.show()


/Users/gaufung/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

When number of cut is 8, it has highest validation scores.

7


In [53]:
wages['maritl'].unique()


Out[53]:
array(['1. Never Married', '2. Married', '4. Divorced', '3. Widowed',
       '5. Separated'], dtype=object)

In [59]:
wages[['wage','maritl']].boxplot(by='maritl')


Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x1197f5f60>

In [60]:
wages['jobclass'].unique()


Out[60]:
array(['1. Industrial', '2. Information'], dtype=object)

In [61]:
wages[['wage','jobclass']].boxplot(by='jobclass')


Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x118a1dd30>

8


In [65]:
auto_file_name = '../data/Auto'
autos = pd.read_table(auto_file_name, sep='\s+', na_values='?')
autos.head()


Out[65]:
mpg cylinders displacement horsepower weight acceleration year origin name
0 18.0 8 307.0 130.0 3504.0 12.0 70 1 chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693.0 11.5 70 1 buick skylark 320
2 18.0 8 318.0 150.0 3436.0 11.0 70 1 plymouth satellite
3 16.0 8 304.0 150.0 3433.0 12.0 70 1 amc rebel sst
4 17.0 8 302.0 140.0 3449.0 10.5 70 1 ford torino

In [66]:
autos = autos.dropna()
autos.head()


Out[66]:
mpg cylinders displacement horsepower weight acceleration year origin name
0 18.0 8 307.0 130.0 3504.0 12.0 70 1 chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693.0 11.5 70 1 buick skylark 320
2 18.0 8 318.0 150.0 3436.0 11.0 70 1 plymouth satellite
3 16.0 8 304.0 150.0 3433.0 12.0 70 1 amc rebel sst
4 17.0 8 302.0 140.0 3449.0 10.5 70 1 ford torino

In [73]:
from pandas.tools.plotting import scatter_matrix
fig, ax = plt.subplots(figsize=(15, 15))
scatter_matrix(autos,ax=ax);


/Users/gaufung/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2881: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  exec(code_obj, self.user_global_ns, self.user_ns)

mpg appears proportional to displcement, horsepower,weigth and acceleration

9


In [77]:
boston_file_name = '../data/Boston.csv'
bostons = pd.read_csv(boston_file_name, index_col=0)
bostons.plot.scatter(x='dis', y='nox');


9(a)


In [85]:
bostons['dis_2'] = bostons['dis']**2
bostons['dis_3'] = bostons['dis']**3
X = bostons[['dis','dis_2','dis_3']].values
y = bostons['nox'].values
lr = LinearRegression()
lr.fit(X, y)
x = np.linspace(0,12,100)
x = np.vstack((x,x**2,x**3)).T
pred_y = lr.predict(x)
plt.scatter(bostons['dis'].values, y)
plt.plot(np.linspace(0,12,100), pred_y, c='r')
plt.show()



In [86]:
def degree_fit(degree):
    x = bostons['dis'].values
    y = bostons['nox'].values
    stacks = [x]
    for i in range(2,degree+1):
        stacks.append(x**i)
    X = np.vstack(stacks).T
    lr = LinearRegression()
    lr.fit(X, y)
    residual = lr.predict(X) - y
    return np.sum(residual * residual)
degrees = range(1,11)
residuals = []
for degree in degrees:
    residuals.append(degree_fit(degree))
plt.plot(degrees, residuals)
plt.show()



In [95]:
def valid_degree(degree):
    x = bostons['dis'].values
    y = bostons['nox'].values
    stacks = [x]
    for i in range(2,degree+1):
        stacks.append(x**i)
    X = np.vstack(stacks).T
    from sklearn.model_selection import cross_val_score
    lr = LinearRegression()
    scores = cross_val_score(lr, X, y, cv=5)
    return scores.mean()
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    #lr = LinearRegression()
    #lr.fit(X_train, y_train)
    #return lr.score(X_test, y_test)
    #residual = lr.predict(X_test) - y_test
    #return np.sum(residual*residual)
degrees = range(1,11)
scores = []
for degree in degrees:
    scores.append(valid_degree(degree))
plt.plot(degrees, scores)
plt.show()



In [ ]: