In [14]:
from sklearn import datasets
import numpy as np
import pandas as pd
import bokeh
from bokeh.plotting import output_notebook

from datascienceutils import analyze
from datascienceutils import predictiveModels as pm

output_notebook(bokeh.resources.INLINE)
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
# Use only one feature
diabetes_X = diabetes.data[:, np.newaxis, 2]

# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]


Loading BokehJS ...

In [15]:
df = pd.DataFrame(diabetes.data)
target = diabetes.target
analyze.correlation_analyze(df)


# Correlation btw Numerical Columns
# Pandas correlation coefficients matrix
          0         1         2         3         4         5         6  \
0  1.000000  0.173737  0.185085  0.335427  0.260061  0.219243 -0.075181   
1  0.173737  1.000000  0.088161  0.241013  0.035277  0.142637 -0.379090   
2  0.185085  0.088161  1.000000  0.395415  0.249777  0.261170 -0.366811   
3  0.335427  0.241013  0.395415  1.000000  0.242470  0.185558 -0.178761   
4  0.260061  0.035277  0.249777  0.242470  1.000000  0.896663  0.051519   
5  0.219243  0.142637  0.261170  0.185558  0.896663  1.000000 -0.196455   
6 -0.075181 -0.379090 -0.366811 -0.178761  0.051519 -0.196455  1.000000   
7  0.203841  0.332115  0.413807  0.257653  0.542207  0.659817 -0.738493   
8  0.270777  0.149918  0.446159  0.393478  0.515501  0.318353 -0.398577   
9  0.301731  0.208133  0.388680  0.390429  0.325717  0.290600 -0.273697   

          7         8         9  
0  0.203841  0.270777  0.301731  
1  0.332115  0.149918  0.208133  
2  0.413807  0.446159  0.388680  
3  0.257653  0.393478  0.390429  
4  0.542207  0.515501  0.325717  
5  0.659817  0.318353  0.290600  
6 -0.738493 -0.398577 -0.273697  
7  1.000000  0.617857  0.417212  
8  0.617857  1.000000  0.464670  
9  0.417212  0.464670  1.000000  
# Pandas co-variance coefficients matrix
          0         1         2         3         4         5         6  \
0  0.002268  0.000394  0.000420  0.000761  0.000590  0.000497 -0.000170   
1  0.000394  0.002268  0.000200  0.000547  0.000080  0.000323 -0.000860   
2  0.000420  0.000200  0.002268  0.000897  0.000566  0.000592 -0.000832   
3  0.000761  0.000547  0.000897  0.002268  0.000550  0.000421 -0.000405   
4  0.000590  0.000080  0.000566  0.000550  0.002268  0.002033  0.000117   
5  0.000497  0.000323  0.000592  0.000421  0.002033  0.002268 -0.000445   
6 -0.000170 -0.000860 -0.000832 -0.000405  0.000117 -0.000445  0.002268   
7  0.000462  0.000753  0.000938  0.000584  0.001229  0.001496 -0.001675   
8  0.000614  0.000340  0.001012  0.000892  0.001169  0.000722 -0.000904   
9  0.000684  0.000472  0.000881  0.000885  0.000739  0.000659 -0.000621   

          7         8         9  
0  0.000462  0.000614  0.000684  
1  0.000753  0.000340  0.000472  
2  0.000938  0.001012  0.000881  
3  0.000584  0.000892  0.000885  
4  0.001229  0.001169  0.000739  
5  0.001496  0.000722  0.000659  
6 -0.001675 -0.000904 -0.000621  
7  0.002268  0.001401  0.000946  
8  0.001401  0.002268  0.001054  
9  0.000946  0.001054  0.002268  
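
For reference, these look like the standard pandas outputs; a minimal sketch of how to reproduce them directly, assuming correlation_analyze simply wraps the built-in DataFrame methods:

In [ ]:
# Plain-pandas equivalents of the matrices printed above (sketch, assuming
# correlation_analyze delegates to these built-ins).
print(df.corr())  # Pearson correlation coefficients matrix
print(df.cov())   # covariance matrix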

In [16]:
# Train the model using the training sets
lin_model = pm.train(diabetes_X_train, diabetes_y_train, 'linearRegression')

print('Coefficients: \n', lin_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((lin_model.predict(diabetes_X_test) - diabetes_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % lin_model.score(diabetes_X_test, diabetes_y_test))


Coefficients: 
 [ 938.23786125]
Mean squared error: 2548.07
Variance score: 0.47
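
Presumably pm.train with 'linearRegression' wraps scikit-learn's LinearRegression; here is a minimal sketch of the same fit without datascienceutils (an assumption about the wrapper, not confirmed by its source):

In [ ]:
# Sketch: the same single-feature fit with plain scikit-learn.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lr = LinearRegression().fit(diabetes_X_train, diabetes_y_train)
print('Coefficients:', lr.coef_)
print('MSE: %.2f' % mean_squared_error(diabetes_y_test, lr.predict(diabetes_X_test)))
print('R^2: %.2f' % lr.score(diabetes_X_test, diabetes_y_test))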

In [17]:
# Train the model using the training sets
log_model = pm.train(diabetes_X_train, diabetes_y_train, 'logisticRegression')

#print('Coefficients: \n', log_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((log_model.predict(diabetes_X_test) - diabetes_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % log_model.score(diabetes_X_test, diabetes_y_test))


Mean squared error: 10277.60
Variance score: 0.00
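
Note that logistic regression is a classifier: if 'logisticRegression' maps to sklearn's LogisticRegression, the continuous diabetes target gets treated as a few hundred distinct "classes", so .score here is exact-match accuracy (hence 0.00), not R^2. The same caveat applies to the randomForest, svm, bernoulliNB and knn runs below if those also map to classifiers. A quick check:

In [ ]:
# The target is continuous, so almost every value is its own "class".
print(np.unique(diabetes_y_train).size, 'distinct target values in',
      diabetes_y_train.size, 'training samples')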

In [18]:
# Train the model using the training sets
rf_model = pm.train(diabetes_X_train, diabetes_y_train, 'randomForest')

#print('Coefficients: \n', rf_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((rf_model.predict(diabetes_X_test) - diabetes_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % rf_model.score(diabetes_X_test, diabetes_y_test))


Mean squared error: 8553.00
Variance score: 0.00

In [19]:
# Train the model using the training sets
sgd_model = pm.train(diabetes_X_train, diabetes_y_train, 'sgd')
sgd_model.fit(diabetes_X_train, diabetes_y_train)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((sgd_model.predict(diabetes_X_test) - diabetes_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % sgd_model.score(diabetes_X_test, diabetes_y_test))


Mean squared error: 8218.40
Variance score: 0.00

In [20]:
# Train the model using the training sets
xgb_model = pm.train(diabetes_X_train, diabetes_y_train, 'xgboost')
xgb_model.fit(diabetes_X_train, diabetes_y_train)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((xgb_model.predict(diabetes_X_test) - diabetes_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % xgb_model.score(diabetes_X_test, diabetes_y_test))


Mean squared error: 4906.90
Variance score: 0.05
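
The 0.05 score here reads like R^2, which suggests a regressor under the hood; a minimal sketch using the xgboost scikit-learn API directly (assuming 'xgboost' maps to XGBRegressor):

In [ ]:
# Sketch: plain xgboost via its scikit-learn wrapper.
from xgboost import XGBRegressor

xgb = XGBRegressor()
xgb.fit(diabetes_X_train, diabetes_y_train)
print('MSE: %.2f' % np.mean((xgb.predict(diabetes_X_test) - diabetes_y_test) ** 2))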

In [21]:
# Train the model using the training sets
svm_model = pm.train(diabetes_X_train, diabetes_y_train, 'svm')
svm_model.fit(diabetes_X_train, diabetes_y_train)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((svm_model.predict(diabetes_X_test) - diabetes_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % svm_model.score(diabetes_X_test, diabetes_y_test))


Mean squared error: 10277.60
Variance score: 0.00

In [22]:
# Train the model using the training sets
bnb_model = pm.train(diabetes_X_train, diabetes_y_train, 'bernoulliNB')
bnb_model.fit(diabetes_X_train, diabetes_y_train)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((bnb_model.predict(diabetes_X_test) - diabetes_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % bnb_model.score(diabetes_X_test, diabetes_y_test))


Mean squared error: 8755.40
Variance score: 0.00

In [23]:
# Train the model using the training sets
knn_model = pm.train(diabetes_X_train, diabetes_y_train, 'knn')
knn_model.fit(diabetes_X_train, diabetes_y_train)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((knn_model.predict(diabetes_X_test) - diabetes_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % knn_model.score(diabetes_X_test, diabetes_y_test))


Mean squared error: 5640.65
Variance score: 0.00

In [24]:
# Train the model using the training sets
kde_model = pm.train(diabetes_X_train, diabetes_y_train, 'kde')
kde_model.fit(diabetes_X_train, diabetes_y_train)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((kde_model.predict(diabetes_X_test) - diabetes_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % kde_model.score(diabetes_X_test, diabetes_y_test))


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-24-0ccdb43be70d> in <module>()
      4 # The mean squared error
      5 print("Mean squared error: %.2f"
----> 6       % np.mean((kde_model.predict(diabetes_X_test) - diabetes_y_test) ** 2))
      7 # Explained variance score: 1 is perfect prediction
      8 print('Variance score: %.2f' % kde_model.score(diabetes_X_test, diabetes_y_test))

AttributeError: 'KernelDensity' object has no attribute 'predict'

KernelDensity is an unsupervised density estimator: it exposes score_samples() and sample(), but no predict(), which is why the cell above fails. The retry below scores it by held-out log-likelihood instead.

In [ ]:
# KernelDensity has no predict(), so it can't be evaluated like a regressor.
# Report the average log-likelihood of the test set instead.
kde_model = pm.train(diabetes_X_train, diabetes_y_train, 'kde')
kde_model.fit(diabetes_X_train)
print("Mean test log-likelihood: %.2f"
      % np.mean(kde_model.score_samples(diabetes_X_test)))

In [ ]:
# Train the model using the training sets
# Note: MultinomialNB requires non-negative features, and the diabetes
# features are mean-centered (they contain negative values), so shift the
# data by the training minimum before fitting.
shift = diabetes_X_train.min()
mnb_model = pm.train(diabetes_X_train - shift, diabetes_y_train, 'multinomialNB')

print('Coefficients: \n', mnb_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((mnb_model.predict(diabetes_X_test - shift) - diabetes_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % mnb_model.score(diabetes_X_test - shift, diabetes_y_test))

In [ ]:
X, y = datasets.load_diabetes(return_X_y=True)
X.shape
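
The full dataset is 442 samples by 10 features; the models above used only column 2, which is BMI.

In [ ]:
# Column 2 of the feature matrix is BMI.
print(X.shape)                    # (442, 10)
print(diabetes.feature_names[2])  # 'bmi'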

Linear regression comes out on top, with an MSE of 2548.07.

But then, we knew this was a linear dataset to begin with.

Of the non-linear models:

Clearly, xgboost takes the cake, with an MSE of 4906.90 in 5.94 s.

It's followed by knn, at an MSE of 5640.65.

I had heard about lightgbm and wanted to try it.

So check it out: an MSE of 5066.17, and it runs in 194 ms.

Wow, that's roughly 30x faster than xgboost for only about 3% more error. Maybe lightgbm works especially well on linear patterns. I need to check other kinds of patterns; if it keeps up similar trade-offs, it could change the market.
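
The lightgbm run itself isn't shown above; a minimal sketch of how it might have looked (assuming default LGBMRegressor settings, which may not match the exact setup used for the quoted numbers):

In [ ]:
# Hypothetical reproduction of the lightgbm result quoted above.
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor()
lgbm.fit(diabetes_X_train, diabetes_y_train)
print('MSE: %.2f' % np.mean((lgbm.predict(diabetes_X_test) - diabetes_y_test) ** 2))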