In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
In [2]:
# input data reading
df = pd.read_csv("winequality-white.csv", sep=";")
df.head()
Out[2]:
In [3]:
# independent variables and dependent variable
x = df.loc[:, : "alcohol"]
y = df["quality"]
In [4]:
# features have to be scaled before model learning
scaler = StandardScaler()
x = scaler.fit_transform(x)
In [5]:
# spliting data to train and test parts
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
In [66]:
# create and fit LASSO and SVR models
lasso = Lasso(alpha=0.001)
svr = SVR(C=8, epsilon=0.2, gamma=0.5)
lasso.fit(x_train, y_train)
svr.fit(x_train, y_train)
Out[66]:
In [67]:
# make prediction on test data and rounding results
y_pred_lasso = np.round(np.clip(lasso.predict(x_test), 1, 10)).astype(int)
y_pred_svr = np.round(np.clip(svr.predict(x_test), 1, 10)).astype(int)
In [68]:
np.round(1 - mean_squared_error(y_test, y_pred_lasso) / y_test.std(), 2)
Out[68]:
In [69]:
np.round(1 - mean_squared_error(y_test, y_pred_svr) / y_test.std(), 2)
Out[69]:
This LASSO model describes just 28% of initial data dispersion. With same data SVR describes 50%. Here SVR build nonlinear decision boundary using so-called "kernel trick"