In [1]:
import numpy as np
import pandas as pd
import utilities as utils
import pylab as pl
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
# Compact array printing for the whole notebook: 2 decimals, no scientific
# notation, 120-character lines, and 4 leading/trailing items when a large
# array is summarised.
np.set_printoptions(precision=2, linewidth=120, suppress=True, edgeitems=4)
In [2]:
# Load the raw food-facts table. The path is relative to the directory the
# notebook is run from.
rawdata = pd.read_csv("world-food-facts/FoodFacts.csv")
In [3]:
# Keep only the rows that have a UK nutrition score; that column is the
# regression target further down.
not_null_data = rawdata.loc[rawdata['nutrition_score_uk_100g'].notnull()]
In [4]:
# Feature columns: every per-100g column except the score columns themselves.
nutriment_cols = [
    column
    for column in not_null_data.columns
    if '_100g' in column and 'score' not in column
]
# Missing nutriment values are filled with 0 for now.
data = not_null_data[nutriment_cols].fillna(0)
data.shape
Out[4]:
In [5]:
# Target vector: the UK nutrition score, selected by the row labels of `data`
# so features and target stay aligned.
target = not_null_data['nutrition_score_uk_100g'].loc[data.index]
target.shape
Out[5]:
In [6]:
# Convert features and target to plain NumPy arrays for scikit-learn.
# `.values` is used instead of `as_matrix()`, which is deprecated and was
# removed in pandas 1.0; `.values` works on both old and new pandas.
x = data.values
y = target.values
In [7]:
# Append a constant column of ones as the last feature.
# The matrix is rebuilt from `data` rather than from `x` itself, so re-running
# this cell cannot stack a second ones column onto an already augmented `x`.
# NOTE(review): LinearRegression fits its own intercept by default, so this
# column is probably redundant -- confirm before removing it.
x = np.column_stack((data.values, np.ones(len(data))))
print(x)
In [8]:
# Ordinary least-squares fit on the full data set (no hold-out rows).
linreg = LinearRegression()
linreg.fit(x,y)
Out[8]:
In [9]:
# In-sample predictions: `linreg` is evaluated on the same `x` passed to fit().
p = linreg.predict(x)
p
Out[9]:
In [10]:
# Absolute in-sample error of each prediction.
err = np.absolute(p - y)
err
Out[10]:
In [11]:
# Training RMSE: square root of the mean squared error over all predictions.
total_error = err.dot(err)  # sum of squared errors
rmse_train = np.sqrt(total_error / p.shape[0])
rmse_train
Out[11]:
In [12]:
# Coefficients of the fitted linear model (displayed as the cell output).
linreg.coef_ #Regression Coefficients
Out[12]:
In [13]:
# Predicted vs. real scores as red dots; the green y = x line is where
# perfect predictions would fall.
plt.plot(p, y, 'ro')
plt.plot((-25, 50), (-25, 50), 'g-')
plt.xlabel('predicted')
plt.ylabel('real')
plt.show()
In [14]:
# RMSE with 10-Fold Cross Validation
# A separate estimator is fitted per fold so that `linreg` (the full-data fit
# whose coefficients and predictions are inspected in earlier cells) is not
# silently replaced by the model from the last fold, and `p` is not overwritten.
kf = KFold(len(x), n_folds=10)
cv_linreg = LinearRegression()
xval_err = 0
for train, test in kf:
    cv_linreg.fit(x[train], y[train])
    fold_resid = cv_linreg.predict(x[test]) - y[test]  # residuals on the held-out rows
    print(fold_resid)
    xval_err += np.dot(fold_resid, fold_resid)  # accumulate squared error across folds
# Each row is held out exactly once, so dividing by len(x) gives the mean
# squared error over the whole data set.
rmse_10cv = np.sqrt(xval_err / len(x))
In [15]:
# Summary for the plain linear model: in-sample RMSE vs. 10-fold CV RMSE.
# NOTE: `rmse_10cv` is reassigned by later cells, so run this cell before them
# or it will report another model's cross-validation error.
print('Method: Linear Regression')
print('RMSE on training: %.4f' %rmse_train)
print('RMSE on 10-fold CV: %.4f' %rmse_10cv)
Whoa! The RMSE of plain linear regression under 10-fold cross-validation is terrible! Let's try regularized models (ridge, lasso and elastic net) instead.
In [16]:
def cv_rmse(model, features, targets, folds):
    """Return the cross-validated RMSE of `model`.

    `folds` is an iterable of (train_indices, test_indices) pairs. The model is
    refitted on every training split and scored on the matching held-out rows;
    the squared errors are pooled and averaged over len(features), which
    assumes the test splits together cover every row exactly once.
    """
    sq_err = 0.0
    for train_idx, test_idx in folds:
        model.fit(features[train_idx], targets[train_idx])
        resid = model.predict(features[test_idx]) - targets[test_idx]
        sq_err += np.dot(resid, resid)
    return np.sqrt(sq_err / len(features))


# Sweep the regularisation strength and print the 10-fold CV RMSE of ridge,
# lasso and elastic net for each alpha (one table row per alpha).
print('alpha\t\tridge\t\tlasso\t\telastic-net\n')
alpha = np.linspace(0.01,0.5,50)
# KFold is not shuffled, so the folds are identical for every model: build once.
kf = KFold(len(x), n_folds=10)
for a in alpha:
    # Order must match the column order of the header printed above.
    models = (
        Ridge(fit_intercept=True, alpha=a),
        Lasso(fit_intercept=True, alpha=a),
        ElasticNet(fit_intercept=True, alpha=a),
    )
    results = [cv_rmse(met, x, y, kf) for met in models]
    print('{:.3f}\t\t{:.4f}\t\t{:.4f}\t\t{:.4f}\n'.format(a, *results))
In [17]:
# Lasso at a fixed regularisation strength, scored with 10-fold cross validation.
# NOTE(review): 0.23 was presumably read off the alpha sweep -- confirm.
LASSO_ALPHA = 0.23
print('Lasso Regression w/ alpha={}'.format(LASSO_ALPHA))
lasso = Lasso(fit_intercept=True, alpha=LASSO_ALPHA)
# computing RMSE using 10-fold cross validation
kf = KFold(len(x), n_folds=10)
xval_err = 0
for train, test in kf:
    lasso.fit(x[train], y[train])
    fold_pred = lasso.predict(x[test])
    fold_resid = fold_pred - y[test]
    xval_err += np.dot(fold_resid, fold_resid)
    # Each fold's held-out predictions are added to the same figure, so the
    # final scatter shows one out-of-fold prediction per row.
    plt.plot(fold_pred, y[test], 'ro')
# Green y = x reference line: where perfect predictions would fall.
plt.plot([-25, 50], [-25, 50], 'g-')
plt.xlabel('predicted')
plt.ylabel('real')
plt.show()
rmse_10cv = np.sqrt(xval_err / len(x))
print('rmse with 10-fold cross validation = {:.4f}'.format(rmse_10cv))
That's a lot better!
In [ ]: