In [1]:
from sklearn.datasets import load_boston
# Load the Boston housing dataset bundled with scikit-learn and show its
# textual description.
boston = load_boston()
# Single parenthesised argument: prints identically on Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
print(boston.DESCR)
In [2]:
# Display the dimensions of the Boston data array.
boston.data.shape
Out[2]:
In [3]:
# Row count of the Boston data array, i.e. how many samples are available.
num_samples = len(boston.data)
# Echo the value as the cell output.
num_samples
Out[3]:
In [4]:
# Display the first row of the Boston data array (one sample).
boston.data[0]
Out[4]:
In [5]:
# Display the dimensions of the Boston target array.
boston.target.shape
Out[5]:
In [6]:
# Set the figure size to 12 x 8 for the plots below.
# NOTE(review): `figsize` is not imported anywhere in this file; presumably it
# comes from the interactive %pylab namespace - confirm.
figsize(12,8)
In [7]:
# Scatter plot of column 5 of the data (labelled RM on the x axis) against
# the target values, over all samples.
scatter(boston.data[:,5], boston.target)
xlabel(u'RM (número médio de cômodos)')
ylabel(u'Valor médio (em US$ 1.000)')
Out[7]:
In [8]:
from sklearn.cross_validation import ShuffleSplit
In [9]:
# One random split of the sample indices with 25% held out for testing.
# random_state is pinned so that a fresh "Restart & Run All" reproduces the
# same train/test partition (and therefore the same errors and scores).
ssplit = ShuffleSplit(num_samples, n_iter=1, test_size=0.25, random_state=0)
In [10]:
# ssplit yields (train indices, test indices) pairs. It was built with
# n_iter=1, so there is exactly one pair: take it directly instead of running
# a loop whose body is empty.
train_idx, test_idx = next(iter(ssplit))
In [11]:
# Display the indices selected for the training split.
train_idx
Out[11]:
In [12]:
# How many samples ended up in the training split.
n_train = len(train_idx)
# Echo the count as the cell output.
n_train
Out[12]:
In [13]:
# Display the indices selected for the test split.
test_idx
Out[13]:
In [14]:
# How many samples ended up in the test split.
n_test = len(test_idx)
# Echo the count as the cell output.
n_test
Out[14]:
In [15]:
# Display the combined size of the two splits (compare with num_samples).
n_train + n_test
Out[15]:
In [16]:
# Single-feature training matrix: column 5 (RM) of the training rows, shaped
# as one column. reshape(-1, 1) lets NumPy infer the row count from the
# selection itself, so this cell no longer depends on n_train having been
# computed from the same train_idx in an earlier cell.
X = boston.data[train_idx, 5].reshape(-1, 1)
# Show the resulting (rows, 1) shape.
X.shape
Out[16]:
In [17]:
# Target values for the training rows.
y = boston.target[train_idx]
In [18]:
from sklearn.linear_model import LinearRegression
In [19]:
# Create a linear regression model and fit it on the training data (X, y).
regr = LinearRegression()
# Last expression of the cell: its return value is what the notebook displays.
regr.fit(X, y)
Out[19]:
In [20]:
# Training points, with the model's predictions for the same inputs drawn
# over them as a line.
scatter(X, y)
plot(X, regr.predict(X))
# Axis labels.
xlabel(u'RM (número médio de cômodos)')
ylabel(u'Valor médio (em US$ 1.000)')
# Axis ranges (the same ones used by the other fit plots in this notebook).
xlim(3, 9)
ylim(0, 55)
Out[20]:
Erro quadrático médio
In [21]:
# Mean squared error on the training data: average of the squared
# differences between predictions and targets.
((regr.predict(X) - y) ** 2).mean()
Out[21]:
In [22]:
# Model score on the training data (for scikit-learn regressors, score() is
# the coefficient of determination R^2).
regr.score(X, y)
Out[22]:
In [23]:
# Single-feature test matrix: column 5 (RM) of the held-out rows, shaped as
# one column. reshape(-1, 1) infers the row count from the selection, so the
# cell does not rely on n_test being in sync with test_idx.
X_t = boston.data[test_idx, 5].reshape(-1, 1)
# Target values for the held-out rows.
y_t = boston.target[test_idx]
Erro quadrático médio
In [24]:
# Mean squared error on the held-out data: average of the squared
# differences between predictions and targets.
((regr.predict(X_t) - y_t) ** 2).mean()
Out[24]:
Score
In [25]:
# Model score on the held-out data (for scikit-learn regressors, score() is
# the coefficient of determination R^2).
regr.score(X_t, y_t)
Out[25]:
In [26]:
# Held-out points, with the model's predictions for the same inputs drawn
# over them as a line.
scatter(X_t, y_t)
plot(X_t, regr.predict(X_t))
# Axis labels.
xlabel(u'RM (número médio de cômodos)')
ylabel(u'Valor médio (em US$ 1.000)')
# Axis ranges (the same ones used by the other fit plots in this notebook).
xlim(3, 9)
ylim(0, 55)
Out[26]:
In [27]:
# Both splits on one figure: training points as blue circles, held-out
# points as red squares.
scatter(X, y, c='b', marker='o')
scatter(X_t, y_t, c='r', marker='s')
# Model predictions for each split's inputs; the held-out line is drawn in red.
plot(X, regr.predict(X))
plot(X_t, regr.predict(X_t), 'r-')
# Axis labels.
xlabel(u'RM (número médio de cômodos)')
ylabel(u'Valor médio (em US$ 1.000)')
# Axis ranges (the same ones used by the other fit plots in this notebook).
xlim(3, 9)
ylim(0, 55)
Out[27]:
In [28]:
# Training rows with every column of the data this time (not just column 5),
# together with their targets.
X, y = boston.data[train_idx], boston.target[train_idx]
# Fresh model, fitted on the full-width training matrix. The fit call is the
# last expression, so its return value is the cell output.
regr = LinearRegression()
regr.fit(X, y)
Out[28]:
Erro quadrático médio
In [29]:
# Mean squared error on the training data: average of the squared
# differences between predictions and targets.
((regr.predict(X) - y) ** 2).mean()
Out[29]:
Score
In [30]:
# Model score on the training data (for scikit-learn regressors, score() is
# the coefficient of determination R^2).
regr.score(X, y)
Out[30]:
Erro quadrático médio
In [31]:
# Held-out rows with every column of the data, together with their targets.
X_t, y_t = boston.data[test_idx], boston.target[test_idx]
# Mean squared error on the held-out data.
((regr.predict(X_t) - y_t) ** 2).mean()
Out[31]:
Score
In [32]:
# Model score on the held-out data (for scikit-learn regressors, score() is
# the coefficient of determination R^2).
regr.score(X_t, y_t)
Out[32]:
In [33]:
from sklearn.datasets import load_diabetes
# Load the diabetes dataset bundled with scikit-learn.
diabetes = load_diabetes()
In [34]:
# Row count of the diabetes data array. This rebinds num_samples, which held
# the Boston row count until now.
num_samples = len(diabetes.data)
# Echo the value as the cell output.
num_samples
Out[34]:
In [35]:
# One random split of the sample indices with 25% held out for testing.
# random_state is pinned so that a fresh "Restart & Run All" reproduces the
# same train/test partition (and therefore the same errors and scores).
ssplit = ShuffleSplit(num_samples, n_iter=1, test_size=0.25, random_state=0)
In [36]:
# ssplit yields (train indices, test indices) pairs. It was built with
# n_iter=1, so there is exactly one pair: take it directly instead of running
# a loop whose body is empty.
train_idx, test_idx = next(iter(ssplit))
In [37]:
# Display the indices selected for the training split.
train_idx
Out[37]:
In [38]:
# Display the indices selected for the test split.
test_idx
Out[38]:
In [39]:
# Training rows of the diabetes data (all columns) and their targets.
X, y = diabetes.data[train_idx], diabetes.target[train_idx]
# Fresh model, fitted on the diabetes training split. The fit call is the
# last expression, so its return value is the cell output.
regr = LinearRegression()
regr.fit(X, y)
Out[39]:
Erro quadrático médio
In [40]:
# Mean squared error on the training data: average of the squared
# differences between predictions and targets.
((regr.predict(X) - y) ** 2).mean()
Out[40]:
In [41]:
# Held-out rows of the diabetes data (all columns) and their targets.
X_t, y_t = diabetes.data[test_idx], diabetes.target[test_idx]
# Mean squared error on the held-out data.
((regr.predict(X_t) - y_t) ** 2).mean()
Out[41]:
In [42]:
# Model score on the held-out data (for scikit-learn regressors, score() is
# the coefficient of determination R^2).
regr.score(X_t, y_t)
Out[42]:
In [132]:
# Fresh 25%-held-out split of the Boston data for the Lasso experiment.
# The sample count is read from boston.data itself: num_samples was last
# rebound to the diabetes row count, so reusing it here would draw the
# indices over the wrong range for the Boston arrays.
# random_state is pinned so the split is reproducible across re-runs.
ssplit = ShuffleSplit(boston.data.shape[0], n_iter=1, test_size=0.25,
                      random_state=0)
# n_iter=1 means exactly one (train, test) pair: take it directly rather than
# looping with an empty body.
train_idx, test_idx = next(iter(ssplit))
# Training and held-out rows (all columns) with their targets.
X = boston.data[train_idx]
y = boston.target[train_idx]
X_t = boston.data[test_idx]
y_t = boston.target[test_idx]
In [133]:
from sklearn.linear_model import Lasso
In [134]:
# Lasso model with regularisation strength alpha=0.001, passed at
# construction time.
lasso_regr = Lasso(alpha=0.001)
# Fit on the training split; the return value of fit is the cell output.
lasso_regr.fit(X, y)
Out[134]:
In [135]:
# Lasso model score on the held-out data (for scikit-learn regressors,
# score() is the coefficient of determination R^2).
lasso_regr.score(X_t, y_t)
Out[135]:
In [136]:
# Human-readable descriptions of the Boston variables. The list is paired
# position-by-position with the model's coef_ when the coefficients are
# printed, so the order here must match the column order of boston.data.
# NOTE(review): presumably copied from boston.DESCR - verify the order there.
boston_vars = ['CRIM (per capita crime rate by town)',
'ZN (proportion of residential land zoned for lots over 25,000 sq.ft.)',
'INDUS (proportion of non-retail business acres per town)',
'CHAS (Charles River dummy variable, = 1 if tract bounds river; 0 otherwise)',
'NOX nitric oxides concentration (parts per 10 million)',
'RM (average number of rooms per dwelling)',
'AGE (proportion of owner-occupied units built prior to 1940)',
'DIS (weighted distances to five Boston employment centres)',
'RAD (index of accessibility to radial highways)',
'TAX (full-value property-tax rate per $10,000)',
'PTRATIO (pupil-teacher ratio by town)',
'B (1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town)',
'LSTAT (lower status of the population)']
In [137]:
# Print each fitted Lasso coefficient (3 decimals) followed by a tab and the
# description of the variable it belongs to.
for descr, coefficient in zip(boston_vars, lasso_regr.coef_):
    # Single parenthesised argument: same output on Python 2 and Python 3,
    # whereas the bare `print x` statement is a syntax error on Python 3.
    print("%.3f\t%s" % (coefficient, descr))