Over 370,000 used cars were scraped from eBay Kleinanzeigen. The content of the data is in German, so it has to be translated to English first. The data is available here. The fields included in the file data/autos.csv are:
Goal
Given the characteristics/features of a car, predict its sale price.
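Since the raw values are in German, one option is to map the common German category values to English before encoding. A minimal sketch follows; the values shown (kleinwagen, manuell, benzin, ja/nein) are assumptions about typical eBay Kleinanzeigen entries, not values verified against this file.
In [ ]:
# Hedged sketch: translate a few German categorical values to English with a small mapping
import pandas as pd

de_to_en = {'kleinwagen': 'small car', 'kombi': 'station wagon', 'cabrio': 'convertible',
            'manuell': 'manual', 'automatik': 'automatic',
            'benzin': 'petrol', 'ja': 'yes', 'nein': 'no'}

gearbox = pd.Series(['manuell', 'automatik', 'manuell'])   # stand-in for e.g. cars['gearbox']
gearbox.replace(de_to_en)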
In [1]:
#import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10, 6)
In [2]:
#Load the data
cars = pd.read_csv("data/autos.csv")
In [3]:
#Do a basic sanity check.
#1. Look at the first few records
cars.head()
Out[3]:
In [4]:
#2. What are the column names?
cars.columns
Out[4]:
In [5]:
#3. What are the column types?
cars.dtypes
Out[5]:
In [6]:
#4. Do label encoding
from sklearn import preprocessing
In [7]:
cars.columns
Out[7]:
In [10]:
col_object = [x for x in cars.columns if cars[x].dtype == "object"]
col_object
Out[10]:
In [11]:
col_others = [x for x in cars.columns if cars[x].dtype != "object"]
col_others
Out[11]:
In [12]:
le = preprocessing.LabelEncoder()
In [13]:
cars_encoded_object = cars[col_object].apply(le.fit_transform)
In [14]:
cars_encoded_object.dtypes
Out[14]:
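Note that `cars[col_object].apply(le.fit_transform)` re-fits the same LabelEncoder on every column, so only the last column's mapping is left in `le`. If the integer codes ever need to be mapped back to the original strings, a sketch with one encoder per column (reusing the `cars` and `col_object` defined above) could look like this:
In [ ]:
# Hedged sketch: keep a separate LabelEncoder per object column so codes can be inverted later
encoders = {}
encoded = pd.DataFrame(index=cars.index)
for col in col_object:
    enc = preprocessing.LabelEncoder()
    encoded[col] = enc.fit_transform(cars[col].astype(str))   # astype(str) also covers missing values
    encoders[col] = enc
# e.g. encoders[col_object[0]].inverse_transform(encoded[col_object[0]][:5])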
In [15]:
cars_encoded = pd.concat([cars_encoded_object, cars[col_others]],axis = 1)
In [16]:
cars_encoded.head()
Out[16]:
In [17]:
cars_encoded.columns
Out[17]:
In [18]:
#5. Ideally, we would do some exploratory analysis.
#For practice, plot: year of registration vs price
In [19]:
plt.scatter(cars_encoded.yearOfRegistration,
            cars_encoded.price,
            s=20, alpha=0.05)
plt.xlabel('year')
plt.ylabel('price')
Out[19]:
In [20]:
# log10-transform the price; the +1 avoids log(0) for zero-price listings
cars_encoded['log10price'] = np.log10(cars_encoded.price + 1)
In [21]:
plt.scatter(cars_encoded.yearOfRegistration, cars_encoded.log10price,
            s=20, alpha=0.05)
plt.xlabel('year')
plt.ylabel('log10price')
Out[21]:
In [22]:
#Plot month of registration against the log price
In [23]:
plt.scatter(cars_encoded.monthOfRegistration, cars_encoded.log10price,
            s=20, alpha=0.05)
plt.xlabel('monthOfRegistration')
plt.ylabel('log10price')
Out[23]:
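Step 5 calls for more exploratory analysis; one simple additional plot, using only the columns already created above, is the distribution of the log-transformed price.
In [ ]:
# Hedged sketch: histogram of log10price to see the overall price distribution
plt.hist(cars_encoded.log10price, bins=50)
plt.xlabel('log10price')
plt.ylabel('count')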
In [24]:
cars_encoded.head()
Out[24]:
In [25]:
# target: the last column (log10price)
y = cars_encoded.iloc[:,-1]
In [26]:
# features: the first 12 columns of the encoded frame
X = cars_encoded.iloc[:,0:12]
In [27]:
#6. Build an OLS linear model
# From sklearn, import the linear models
from sklearn import linear_model
In [28]:
model_ols = linear_model.LinearRegression()
In [29]:
model_ols.fit(X, y)
Out[29]:
In [30]:
model_ols.coef_
Out[30]:
In [31]:
model_ols.intercept_
Out[31]:
In [32]:
y_ols = model_ols.predict(X)
In [33]:
from sklearn import metrics
In [34]:
#7. Report the diagnostics and discuss the results
metrics.mean_squared_error(y_ols,y)
Out[34]:
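The raw MSE on the log scale is hard to interpret on its own. A short sketch of a few extra in-sample diagnostics, reusing `y`, `y_ols`, and the `metrics` module imported above:
In [ ]:
# Hedged sketch: additional in-sample diagnostics for the OLS fit
rmse = np.sqrt(metrics.mean_squared_error(y, y_ols))
r2 = metrics.r2_score(y, y_ols)
print("RMSE (log10 scale):", rmse)
print("R^2:", r2)
# rough RMSE back on the original price scale (the model was fit on log10(price + 1))
print("RMSE (price scale, approx.):", np.sqrt(np.mean((10**y - 10**y_ols)**2)))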
In [35]:
#8. Build an L2 (ridge) regression using sklearn's linear_model.Ridge
In [36]:
from sklearn.linear_model import Ridge
In [120]:
# deliberately huge alpha (10e10 = 1e11) to see how strongly the coefficients shrink
model_ridge = Ridge(alpha=10e10)
In [121]:
model_ridge.fit(X, y)
Out[121]:
In [122]:
model_ridge.coef_
Out[122]:
In [123]:
sum(model_ridge.coef_)
Out[123]:
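To see how strongly the large alpha shrinks the fit, one quick check is to compare the total coefficient magnitude of the OLS and ridge models fitted above (a rough sketch, not a formal diagnostic):
In [ ]:
# Hedged sketch: compare total absolute coefficient size, OLS vs. heavily regularized ridge
np.abs(model_ols.coef_).sum(), np.abs(model_ridge.coef_).sum()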
In [108]:
y_ridge = model_ridge.predict(X)
In [42]:
metrics.mean_squared_error(y_ridge,y)
Out[42]:
In [43]:
#9. Try different values of alpha (0.001, 0.01, 0.05, 0.1, 0.5); see the sketch below
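A possible sketch for step 9: refit the ridge model for each listed alpha and compare the in-sample MSE and coefficient sum, reusing the X, y, Ridge, and metrics defined above.
In [ ]:
# Hedged sketch for #9: fit Ridge for several alphas and compare in-sample MSE
for a in [0.001, 0.01, 0.05, 0.1, 0.5]:
    m = Ridge(alpha=a)
    m.fit(X, y)
    mse = metrics.mean_squared_error(y, m.predict(X))
    print("alpha =", a, "MSE =", mse, "sum of coefs =", m.coef_.sum())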
#10. The following code is from the sklearn official documentation
# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
# License: BSD 3 clause
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
# X is the 10x10 Hilbert matrix
# (note: this overwrites the X and y that were built from the cars data above)
X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
y = np.ones(10)
#compute paths
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)
clf = linear_model.Ridge(fit_intercept=False)
coefs = []
for a in alphas:
    clf.set_params(alpha=a)
    clf.fit(X, y)
    coefs.append(clf.coef_)
#Display results
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
Can you modify this code to plot 10 values of alpha and see how the weights change?
In [57]:
X_std = preprocessing.StandardScaler().fit_transform(X)
In [136]:
# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
# License: BSD 3 clause
#compute paths
n_alphas = 20
alphas = np.logspace(-10, 10, n_alphas)
clf = linear_model.Ridge(fit_intercept=False)
coefs = []
for a in alphas:
    clf.set_params(alpha=a)
    clf.fit(X, y)
    coefs.append(clf.coef_)
#Display results
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
In [137]:
#11. Build an L1 (Lasso) linear model
from sklearn.linear_model import Lasso
In [138]:
model_lasso = Lasso(alpha=0.1, fit_intercept = False)
In [139]:
model_lasso.fit(X, y)
Out[139]:
In [140]:
model_lasso.coef_
Out[140]:
In [141]:
np.sum(model_lasso.coef_)
Out[141]:
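A useful companion check for an L1 fit is how many coefficients were driven exactly to zero, since that is what distinguishes it from the ridge model above:
In [ ]:
# Hedged sketch: count coefficients the Lasso set exactly to zero
np.sum(model_lasso.coef_ == 0)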
In [142]:
#12. Feature selection from the L1 linear model
In [143]:
from sklearn.feature_selection import SelectFromModel
In [144]:
# constructed with a larger max_iter but not assigned; the fitted model_lasso above is reused below
Lasso(alpha=0.1, fit_intercept=False, max_iter=10000000)
Out[144]:
In [145]:
sfm = SelectFromModel(model_lasso, threshold=0.1)
In [146]:
sfm
Out[146]:
In [147]:
sfm.fit(X, y)
Out[147]:
In [148]:
featureSel = sfm.transform(X)
In [149]:
featureSel.shape
Out[149]:
In [150]:
featureSel[:,:3]
Out[150]:
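The transformed array no longer carries column names. A small sketch using SelectFromModel's get_support mask recovers which columns passed the threshold; it assumes X is still the cars feature DataFrame (if X was overwritten by the Hilbert-matrix example above, rebuild it from cars_encoded first):
In [ ]:
# Hedged sketch: map the selected features back to their column names
selected_mask = sfm.get_support()          # boolean mask over the input columns
X.columns[selected_mask]                   # assumes X is the cars feature DataFrame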
In [151]:
#13. Find the generalization error.
#Split the dataset into train and test sets: 80% / 20%
from sklearn.model_selection import train_test_split
In [152]:
# a fixed random_state could be passed here for a reproducible split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
In [86]:
x_train.shape, y_train.shape
Out[86]:
In [88]:
#14. Build an L2-regularized (ridge) model on the training set and predict on the test set
In [89]:
# note: `normalize` was removed from Ridge in newer scikit-learn releases; scale the features separately if this errors
model_ridge = Ridge(alpha=1.0, fit_intercept=True, normalize=True)
In [90]:
model_ridge.fit(x_train, y_train)
Out[90]:
In [91]:
ridge_output = model_ridge.predict(x_test)
In [92]:
#15. Report the RMSE
np.sqrt(metrics.mean_squared_error(y_test, ridge_output))
Out[92]:
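To report an actual generalization error, one sketch is to compare train and test RMSE side by side, reusing the split and the refit ridge model from the cells above:
In [ ]:
# Hedged sketch: train vs. test RMSE for the ridge model
rmse_train = np.sqrt(metrics.mean_squared_error(y_train, model_ridge.predict(x_train)))
rmse_test = np.sqrt(metrics.mean_squared_error(y_test, ridge_output))
print("train RMSE:", rmse_train)
print("test RMSE :", rmse_test)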