Main imports



In [1]:

    
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split

Regression sample



In [2]:

    
# input data reading
df = pd.read_csv("winequality-white.csv", sep=";")
df.head()









    Out[2]:






  
    
      
      fixed acidity
      volatile acidity
      citric acid
      residual sugar
      chlorides
      free sulfur dioxide
      total sulfur dioxide
      density
      pH
      sulphates
      alcohol
      quality
    
  
  
    
      0
      7.0
      0.27
      0.36
      20.7
      0.045
      45
      170
      1.0010
      3.00
      0.45
      8.8
      6
    
    
      1
      6.3
      0.30
      0.34
      1.6
      0.049
      14
      132
      0.9940
      3.30
      0.49
      9.5
      6
    
    
      2
      8.1
      0.28
      0.40
      6.9
      0.050
      30
      97
      0.9951
      3.26
      0.44
      10.1
      6
    
    
      3
      7.2
      0.23
      0.32
      8.5
      0.058
      47
      186
      0.9956
      3.19
      0.40
      9.9
      6
    
    
      4
      7.2
      0.23
      0.32
      8.5
      0.058
      47
      186
      0.9956
      3.19
      0.40
      9.9
      6



In [3]:

    
# independent variables and dependent variable
x = df.loc[:, : "alcohol"]
y = df["quality"]



In [4]:

    
# features have to be scaled before model learning
scaler = StandardScaler()
x = scaler.fit_transform(x)



In [5]:

    
# spliting data to train and test parts
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)



In [66]:

    
# create and fit LASSO and SVR models 
lasso = Lasso(alpha=0.001)
svr = SVR(C=8, epsilon=0.2, gamma=0.5)
lasso.fit(x_train, y_train)
svr.fit(x_train, y_train)









    Out[66]:





SVR(C=8, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma=0.5,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)



In [67]:

    
# make prediction on test data and rounding results
y_pred_lasso = np.round(np.clip(lasso.predict(x_test), 1, 10)).astype(int)
y_pred_svr = np.round(np.clip(svr.predict(x_test), 1, 10)).astype(int)



In [68]:

    
np.round(1 - mean_squared_error(y_test, y_pred_lasso) / y_test.std(), 2)









    Out[68]:





0.28000000000000003



In [69]:

    
np.round(1 - mean_squared_error(y_test, y_pred_svr) / y_test.std(), 2)









    Out[69]:





0.5

This LASSO model describes just 28% of initial data dispersion. With same data SVR describes 50%. Here SVR build nonlinear decision boundary using so-called "kernel trick"

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality
0	7.0	0.27	0.36	20.7	0.045	45	170	1.0010	3.00	0.45	8.8	6
1	6.3	0.30	0.34	1.6	0.049	14	132	0.9940	3.30	0.49	9.5	6
2	8.1	0.28	0.40	6.9	0.050	30	97	0.9951	3.26	0.44	10.1	6
3	7.2	0.23	0.32	8.5	0.058	47	186	0.9956	3.19	0.40	9.9	6
4	7.2	0.23	0.32	8.5	0.058	47	186	0.9956	3.19	0.40	9.9	6