Import required libraries


In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
# Showing matplotlib plots in jupyter notebook
%matplotlib inline

Notes before the final version


In [11]:
# Using patientId for better intuition
# Drawing polar heatmap
# Learning curve with smaller numbers
# How we can use SVR for regression
# Making a module for finding the best sklearn function

Read Data


In [12]:
# Getting Training dataset
# Import dataset as Dataframe
df_full = pd.read_csv("../Dataset/slice_localization_data.csv", sep=',')

# Import dataset as numpy array
X_full = np.genfromtxt("../Dataset/slice_localization_data.csv", delimiter=',', skip_header=1)

# Making a distinct output column (the "reference" target) for later use
y = X_full[:,385]

# Removing the first (patientId) and last (reference) columns from the dataset
X = X_full[:,1:385]

# Converting X to Dataframe
df = pd.DataFrame(X)
df.columns = df_full.columns[1:385]

# Getting size of the Training Dataset
m,n = X.shape
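The same X and y can also be derived from the DataFrame alone, which avoids parsing the CSV twice; a minimal sketch, assuming df_full as loaded above:

# Equivalent construction from the DataFrame (sketch)
X_alt = df_full.drop(["patientId", "reference"], axis=1).values
y_alt = df_full["reference"].values

# Sanity check: these should match the numpy-based X and y above
assert X_alt.shape == X.shape and np.allclose(X_alt, X) and np.allclose(y_alt, y)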

In [13]:
# Getting shape of the Dataset
X_full.shape


Out[13]:
(53500, 386)

In [14]:
# Getting first 10 rows of the numpy array
X_full[0:10]


Out[14]:
array([[  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         21.803851],
       [  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         21.745726],
       [  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         21.6876  ],
       ..., 
       [  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         21.396971],
       [  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         21.28072 ],
       [  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         22.617612]])

In [15]:
# Getting first 10 rows of the dataframe
df_full.head(10)


Out[15]:
patientId value0 value1 value2 value3 value4 value5 value6 value7 value8 ... value375 value376 value377 value378 value379 value380 value381 value382 value383 reference
0 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.980381 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.803851
1 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.745726
2 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.687600
3 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.629474
4 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.976833 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.571348
5 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.953202 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.513223
6 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.000000 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.455097
7 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.867572 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.396971
8 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.930170 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.280720
9 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.990034 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 22.617612

10 rows × 386 columns


In [16]:
# Getting summary statistics to build some intuition about the data
df_full.describe()


Out[16]:
patientId value0 value1 value2 value3 value4 value5 value6 value7 value8 ... value375 value376 value377 value378 value379 value380 value381 value382 value383 reference
count 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 ... 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000
mean 47.075701 0.059627 0.071558 0.145819 0.218728 0.274762 0.276189 0.204531 0.062281 -0.042025 ... -0.029404 0.182913 0.320112 0.359373 0.342889 0.266091 0.083049 -0.031146 -0.154524 47.028039
std 27.414240 0.174243 0.196921 0.300270 0.359163 0.378862 0.369605 0.351294 0.292232 0.268391 ... 0.085817 0.383333 0.463517 0.478188 0.471811 0.437633 0.279734 0.098738 0.122491 22.347042
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 -0.250000 -0.250000 -0.250000 ... -0.250000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 -0.250000 -0.250000 1.738733
25% 23.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 29.891607
50% 46.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 43.987893
75% 70.000000 0.000000 0.000000 0.000000 0.446429 0.684477 0.662382 0.441412 0.000000 0.000000 ... 0.000000 0.000000 0.996286 0.999677 0.999560 0.949478 0.000000 0.000000 0.000000 63.735059
max 96.000000 1.000000 1.000000 1.000000 1.000000 0.998790 0.996468 0.999334 1.000000 1.000000 ... 0.961279 1.000000 1.000000 1.000000 1.000000 1.000000 0.999857 0.996839 0.942851 97.489115

8 rows × 386 columns


In [17]:
# Getting the unique values of the "patientId" column
np.unique(df_full["patientId"])


Out[17]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96], dtype=int64)
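Since the notes above mention using patientId for better intuition, it may also help to see how many CT slices each patient contributes; a small sketch, assuming df_full from above:

# Number of CT slices recorded per patient (sketch)
slices_per_patient = df_full["patientId"].value_counts().sort_index()
slices_per_patient.describe()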

Functions Section


In [18]:
# Import mean_squared_error function from the sklearn library
from sklearn.metrics import mean_squared_error

# Note: this function relies on the globally defined model `lm` and on `train_test_split`,
# both of which are set up in the Prediction Section below.
def plotLearningCurves(X,y,step):
    
    m,n = X.shape
    maxVal = int(m / 10) * 10
    N_size_arr = np.arange(10, maxVal + 10, step)
    error_arr = np.zeros(( len(N_size_arr), 2 ))
    index = 0
    
    # Increasing the training set size by "step" in each iteration
    for i in N_size_arr:
        
        # Splitting the first i rows into training and cross-validation sets
        X_train, X_test, y_train, y_test = train_test_split(X[:i,:], y[:i], test_size=0.33, random_state=42)
        
        # Fitting the model
        lm.fit(X_train, y_train)
        
        # Computing the mean squared error of the predictions on both the training and cross-validation sets
        error_arr[index,0] = mean_squared_error(y_train , lm.predict(X_train))
        error_arr[index,1] = mean_squared_error(y_test, lm.predict(X_test))
        
        # Moving to the next row of the error array
        index += 1
    
    # Initializing the figure
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_axes([0,0,1,1])
    ax.set_yscale('log')
    
    # Plotting "Training set size" vs. "Mean Squared Error" for both the training and cross-validation errors
    line1, = ax.plot(N_size_arr,error_arr[:,0], c='red')
    line2, = ax.plot(N_size_arr,error_arr[:,1], c='blue')
    
    # Adding labels and legends to the plot
    ax.set_xlabel("N (Training set size)")
    ax.set_ylabel("Mean Squared Error")
    
    ax.legend((line1,line2),("Train Error","Test Error"))
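Once the linear model lm and train_test_split from the Prediction Section below are defined, the helper can be called directly; a usage sketch (not executed here) on a smaller subset with a finer step:

# Usage sketch: learning curves on the first 5000 rows with a step of 500
# (assumes `lm` and `train_test_split` are already defined, as in the Prediction Section)
plotLearningCurves(X[:5000, :], y[:5000], 500)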

Data analysis


In [19]:
# Plot "Means of each column in dataset( Attributes ) except first and last column"  distplot
sns.distplot(df.mean())

# ===> We can conclude all of our attributes are in [-1,1] range, so we don't need to use feature normalize technique


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x853ff90d30>
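The range claim can also be checked directly from the raw values rather than from the distribution of the column means; a minimal sketch, assuming the df built above:

# Overall minimum and maximum across all attribute columns (sketch)
print(df.values.min(), df.values.max())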

In [20]:
fig = plt.figure(figsize=(12,10))

axes1 = fig.add_axes([0, 2, 1, 1], projection='polar')
axes2 = fig.add_axes([1, 2,1,1], projection='polar')
axes3 = fig.add_axes([0, 1, 1, 1], projection='polar')
axes4 = fig.add_axes([1, 1,1,1], projection='polar')

# Plotting the bone structure histogram of the first example slice
axes1.plot(X[150,0:240], 'bo', ms=10)
axes1.set_xlabel("Bone Structure")

# Plotting the air inclusion histogram of the first example slice
axes2.plot(X[150,240:384], 'ro', ms=10)
axes2.set_xlabel("Air Inclusion")

# Plotting the bone structure histogram of the second example slice
axes3.plot(X[3541,0:240], 'bo', ms=10)
axes3.set_xlabel("Bone Structure")

# Plotting the air inclusion histogram of the second example slice
axes4.plot(X[3541,240:384], 'ro', ms=10)
axes4.set_xlabel("Air Inclusion")


Out[20]:
<matplotlib.text.Text at 0x85392e33c8>

In [21]:
# Plot "Reference" column distplot
plt.figure(figsize=(12,8))
sns.distplot(y, bins=100)

# ==> We can see that we don't have any image for locations of body with value bigger than 100, and majority of the
# images are taken with values in [25,40] range


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x853bb86160>
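Both observations can be double-checked numerically; a small sketch using the y array from above:

# Maximum reference value and the share of slices in the [25, 40] range (sketch)
print(y.max())
print(np.mean((y >= 25) & (y <= 40)))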

Prediction Section


In [22]:
X.shape


Out[22]:
(53500, 384)

In [23]:
y.shape


Out[23]:
(53500,)

In [24]:
# Using the sklearn "LinearRegression" model
from sklearn.linear_model import LinearRegression

# Initializing the model
lm = LinearRegression()

In [25]:
# Splitting Data into Training and Cross Validation Datasets
# Using Sklearn library for splitting data
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


C:\Users\Danietzio\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [26]:
# Fitting the model on the training set ( all columns except "patientId" and "reference" )
lm.fit(X_train,y_train)


Out[26]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
# Model coefficients ( thetas )
lm.coef_


Out[27]:
array([ -2.49139644e+00,  -8.69497759e-01,   9.24385440e-01,
         6.62012145e+00,   6.42405712e+00,   3.29487223e+00,
         8.78259130e-01,   3.23571134e+00,  -5.55753081e+00,
        -9.17715543e-01,  -1.69933517e+00,   7.63349941e-02,
         3.51426894e-01,   4.42502516e-01,  -2.99870138e-01,
         3.65831336e-01,   1.07190094e+00,   6.17675949e-01,
        -3.23179052e+00,   3.59100341e+00,   1.69590241e+00,
        -4.87804896e-01,  -2.08363544e+00,   1.31187654e+00,
        -7.91772202e-01,  -5.14274490e-01,  -7.51391761e-01,
        -9.40538410e+00,   1.10672289e+01,   3.89230047e+00,
         5.89338084e-01,   8.42582554e-01,  -8.38247461e-01,
         3.45091308e-01,   5.93980190e-01,  -3.47319519e+00,
        -4.27445811e+00,  -5.78937940e-01,  -5.57114137e+00,
         1.25405079e+00,  -3.80578414e+00,  -1.00349029e+00,
        -3.04525502e+00,   5.39976900e-02,  -2.39727413e+00,
        -1.92287396e+00,  -2.35967959e-01,   2.05902910e+00,
         2.40929366e+13,   7.64767555e+12,  -3.02840588e+00,
         7.46169872e-01,  -1.28493065e+00,  -1.21671433e+00,
        -3.12624409e-01,  -1.83113386e+00,   1.27002833e+00,
        -5.24887151e+00,  -7.31673243e+00,  -1.78773285e+12,
        -2.31417879e-01,   1.70438096e+00,  -3.75049629e-01,
        -2.95822088e+00,  -2.82389455e+00,  -2.15940207e+00,
        -2.76272857e+00,   4.92988949e+00,   4.65814689e+00,
        -1.56717591e+12,   1.21568880e+00,   2.01366810e+00,
         2.39857854e+00,   4.19334127e-01,   1.82824677e+00,
         1.65955812e+00,  -3.88685965e+00,  -8.39860206e+00,
        -2.40929366e+13,  -7.64767555e+12,   8.53463422e-01,
        -1.83045438e+00,  -1.42406102e+00,   5.02939469e-02,
        -1.58933153e-01,  -1.24758944e+00,  -5.02475387e-01,
         4.87069649e+00,   9.24792557e-01,  -6.32266424e+00,
         1.02663821e+00,  -1.68896754e+00,   1.14282888e+00,
        -6.38146996e-01,   8.44215733e-01,   5.08413705e-01,
        -1.78708864e+00,   9.85034978e+00,   1.89526049e+01,
         1.45151874e+01,  -1.02847512e+00,  -2.05870634e+00,
         1.64076954e+00,   2.00666033e-01,   7.20281244e-01,
         1.07582834e+00,   3.83371713e+00,   1.47878423e+00,
        -3.50697200e+00,   4.06742493e-01,  -3.75842269e+00,
        -3.34627399e-01,  -7.94629849e-01,   7.89308402e-01,
         5.05005881e+00,   3.00721873e+00,   1.75354535e+00,
         6.54587875e-02,  -6.07755595e+00,  -9.58380083e+00,
        -4.51227986e+00,   1.79619506e+00,   1.38762830e+00,
        -1.11675286e+00,  -1.95215528e-01,  -4.46430042e-01,
        -1.01762836e+00,  -4.90434385e-01,  -2.46614388e+00,
         2.66611610e+00,  -1.37647967e+00,   9.19185319e-01,
         4.12840641e+00,  -1.93961937e+00,   9.62455171e-02,
        -3.44545716e+00,  -1.91869019e+00,  -4.04060392e+00,
        -1.89344418e+00,  -2.34833230e+00,   2.43700677e-01,
        -6.81411001e-01,   1.76999030e+00,   2.05834954e+00,
         9.47026006e-01,  -1.57526265e+00,  -1.50836248e+00,
         1.05008774e+00,   1.20334452e+00,  -7.36290933e-01,
        -2.48729200e+00,  -2.17372204e-01,  -1.72407074e+00,
         1.75942564e+00,  -4.29781897e-01,   4.22466593e-01,
        -1.76424428e+00,   5.70585784e-02,   1.68115256e+00,
         2.38964324e+00,  -5.48876505e-01,   5.09385654e-01,
        -5.28876561e-03,  -1.26062911e+00,   1.03249402e-01,
        -3.27606799e-01,   1.20136955e+00,  -2.37390600e+00,
         1.99007954e+00,  -1.79947697e+12,  -2.51057925e+00,
         5.39038268e-03,  -2.29575901e-01,  -5.40080274e-01,
        -2.43405514e+00,   1.54854318e+00,  -3.82061390e+00,
        -5.67866499e-01,  -1.52594609e+11,  -1.86097702e+11,
        -1.88487776e+00,  -3.12075466e-01,   2.28495997e+00,
         1.45810689e+00,  -1.00238233e-01,  -1.82567682e+00,
        -1.03564647e-01,  -4.01340042e+00,   1.52594609e+11,
        -1.59672034e+11,  -1.71662085e-01,   2.68049324e+00,
        -1.58635237e-01,  -1.06797034e+00,   3.87919947e-01,
        -1.90020933e+00,  -1.38165666e+00,   1.83603724e+00,
        -9.90663673e-01,   1.79947697e+12,  -8.31048668e-01,
        -4.49540231e-01,  -1.51146070e+00,  -1.28526709e+00,
        -2.59604957e-01,   2.85563774e-01,  -1.69227694e+00,
         1.55269077e+00,   4.56701133e-01,   3.76849632e-01,
        -1.91458902e+00,  -2.92021795e+00,   2.30222746e+00,
         1.22653657e+00,   7.59727889e-01,  -1.86165009e+00,
        -2.96483101e-01,  -2.43002973e+00,  -3.37867719e+00,
         1.40224303e-01,  -1.82232050e+00,   1.05350968e-01,
         3.57860806e+00,  -9.49098202e-01,  -1.98310114e+00,
        -3.39329335e+00,  -2.78765940e+00,  -3.25634706e+00,
        -3.30424687e+00,   4.19654650e+00,  -2.58865917e+00,
         6.68237559e-01,  -2.17007799e+00,   4.64827510e-01,
        -6.60295172e-01,   5.13933061e-01,  -1.20241829e+00,
        -3.52026591e+00,   1.96907532e+00,   1.08473219e+00,
         3.21247668e-01,   2.98323206e-01,  -2.53328599e+00,
         4.41584666e-02,  -6.94276797e-01,  -1.79061197e-02,
         1.69862841e+01,  -1.20302516e+01,  -7.91865986e-01,
         1.15307801e+00,  -5.60941443e-01,   1.82852235e+00,
         1.45005318e+00,   7.15954274e-01,  -1.91101345e+01,
         2.70094907e-01,   2.80762452e-02,  -1.03779321e+00,
        -2.64925205e-02,   3.14016966e-01,   3.00345634e-01,
        -2.63706836e-01,   5.15326447e+00,   1.11614771e+00,
        -1.70415347e+00,  -7.59155240e-01,   1.52691429e+00,
         2.29132262e-01,   1.16583636e+00,   1.95544752e+00,
        -3.54843662e+00,   1.15491786e+12,  -9.34081205e-01,
        -4.97381602e+00,  -2.02276697e+00,  -7.59189901e-01,
         2.52376438e-01,  -3.71735077e+00,   2.78257244e+00,
        -9.78950961e+00,  -9.89492103e-01,  -1.28198642e+00,
         4.75202494e-01,   7.94580733e-01,  -2.55015760e-01,
         1.89247225e-01,   4.06032824e+00,  -1.15491786e+12,
         1.88293172e+00,   8.78387911e-01,  -3.95531286e-02,
         1.07337961e+00,   1.29379368e-01,   2.16743534e+00,
         6.45827481e+00,   1.50418650e+00,   9.96240341e-01,
        -1.39106242e+00,  -3.83620675e-01,   2.15893836e-01,
         1.23155795e+00,   5.84703457e-02,  -2.45713237e+00,
        -3.74744777e+00,  -1.74112312e-01,  -1.84108023e-01,
         8.41178657e-01,   1.56226816e+00,  -4.72601244e-02,
        -6.37956140e-02,   8.20889187e+00,   9.20855720e+00,
        -1.87929198e-01,  -1.08199184e+00,  -2.18851157e+00,
         1.50152084e+00,   1.04881007e+00,   1.25676152e+00,
         8.00823186e-01,   4.54454666e+00,  -6.16253690e-01,
         1.23379083e+00,  -1.06513798e+00,  -2.82781086e+00,
         8.52455747e-01,  -6.12104802e-01,   1.23602662e+00,
        -3.77366610e+00,  -6.06277656e-01,  -1.01648290e+00,
        -6.57410728e-01,   5.15356659e-02,  -8.37834528e-02,
         9.33323282e-01,   9.75698977e-01,   1.60999119e+01,
        -6.76887026e-01,   3.83724313e-02,  -9.73070573e-01,
        -1.53679424e+00,   2.25893447e+00,   5.70659997e+00,
        -5.76431091e+00,  -6.76112093e+11,   9.39291231e-01,
         1.05380389e-01,   1.09836955e-01,   2.68028740e+00,
         1.15890375e-01,   2.16729178e+00,   3.56678901e-01,
         1.66181605e+09,   2.63546480e-01,  -3.98429128e-01,
        -1.62373434e-01,  -4.31590128e-01,   7.76861210e-01,
         3.67767608e-01,  -4.93437066e+00,   6.76112093e+11,
        -1.74274564e-01,   5.48986200e-01,  -3.43278254e+00,
        -1.41548161e+00,   1.91442591e+00,   8.33643186e-02,
        -2.72351738e+00,  -1.86789723e+01,  -4.71399312e-01,
        -1.46130203e-01,  -1.03484869e+00,  -1.07678057e+00,
         8.70736585e-01,  -8.20396077e-01,  -5.71111702e+00,
         2.41724076e+00,   1.24450984e-01,  -3.63836904e-01,
        -1.58250963e+00,   1.16640127e+00,   7.09022751e-01,
         1.81274474e+00,   4.05353730e+00,   5.36275010e+00])

In [28]:
# Predicting reference values on the cross-validation dataset
pred = lm.predict(X_test)

# Evaluating the error with the mean squared error formula, without using the sklearn helper
eval_err = np.sum( ( pred - y_test ) ** 2 , axis=0 ) / len(pred)
eval_err


Out[28]:
68.254364896165498
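The manual computation above should agree with sklearn's helper; a quick check:

# Should print (approximately) the same value as eval_err above
print(mean_squared_error(y_test, pred))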

In [29]:
# Plot predictions vs. y_test for a better understanding of how our model works!
fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0,0,1,1])

ax.set_xlabel("Test Target Variable")
ax.set_ylabel("Predictions")
ax.plot(y_test, pred,'bo',ms=1)


Out[29]:
[<matplotlib.lines.Line2D at 0x853c0af390>]

In [30]:
plotLearningCurves(X,y,1000)


Trying to get better predictions


In [31]:
# Checking the error on the training set and comparing it with the test set error
pred_train = lm.predict(X_train)
mean_squared_error(pred_train,y_train)

# ===> The MSE is high on both the training and cross-validation datasets, so the algorithm may be suffering from high bias


Out[31]:
1.5484293731660227e+18

In [32]:
# Normalizing features may not improve accuracy, but it can reduce the fitting time
lm.normalize = True
lm.fit(X_train,y_train)


Out[32]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [33]:
pred_train = lm.predict(X_train)
mean_squared_error(pred_train, y_train)

# ===> We can see normalizing doesn't help


Out[33]:
67.796342478357161

In [34]:
# Using another sklearn regression model
from sklearn.linear_model import SGDRegressor

sg = SGDRegressor(alpha=0,max_iter=10000,n_iter=10000,learning_rate='constant',eta0=0.0001)


C:\Users\Danietzio\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.
  DeprecationWarning)

In [35]:
# Fitting Model
sg.fit(X_train,y_train)


Out[35]:
SGDRegressor(alpha=0, average=False, epsilon=0.1, eta0=0.0001,
       fit_intercept=True, l1_ratio=0.15, learning_rate='constant',
       loss='squared_loss', max_iter=10000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [36]:
# Predicting using the new model ( SGDRegressor )
pred = sg.predict(X_test)

In [37]:
# Evaluating model
mean_squared_error(y_test,pred)


Out[37]:
68.293097049145004

In [38]:
# Plot predictions vs. y_test for a better understanding of how our model works!
fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0,0,1,1])

ax.set_xlabel("Test Target Variable")
ax.set_ylabel("Predictions")
ax.plot(y_test, pred,'bo',ms=1)


Out[38]:
[<matplotlib.lines.Line2D at 0x853ee22320>]

In [ ]:
# Because the train error and the test error are very close to each other and both are fairly high, we can conclude \
# that we are suffering from high bias, so we should add extra features

# In the last try we saw that more iterations don't help much, \
# so we try adding more features instead
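# A standard alternative for creating extra features is sklearn's PolynomialFeatures transformer.
# The sketch below is illustrative only (run on a small slice, since a degree-2 expansion of all 384
# columns produces tens of thousands of columns); the module below builds its own polynomial
# features via extraFeatureCreature instead. It assumes X_train from the earlier split is available.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly_sample = poly.fit_transform(X_train[:100])
print(X_poly_sample.shape)   # (100, 74304): the 384 original columns plus all squared and pairwise terms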

# Function for finding the best degree and alpha
# Before running this function we should import the following:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import svm

# Version 1.0.0
def findBestRegressorModel(X, y, sample_d=(1,10), sample_alpha=None, _kernel='rbf'):
    
    # sample_d should be tuple with size of 2
    if( type(sample_d) == tuple ):
        if( len(sample_d) != 2):
            raise ValueError("sample_d length should be 2 !!!")
        else:
            sample_d = np.arange(sample_d[0], sample_d[1] + 1, 1)
    else:
        raise ValueError("sample_d should be tuple !!!")
    
    # Finding which trainer model is better to use
    trainer = findBestTrainer(X)
    
    # Defining the default candidate values of the regularization strength ( alpha )
    if( sample_alpha is None ):
        sample_alpha = np.array([0.0001,0.001,0.003,0.01,0.03,0.1,0.3,1,3,10])
    
    # Container of the model errors for each degree ( the last two columns hold the lambda and degree values \
    # of each iteration )
    error_tracker = np.zeros(( len(sample_d), 4))
        
    # Splitting data into training and test datasets ( the cross-validation split happens inside the loop )
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # Creating artificial new features with the degrees defined in the sample_d array
    # A_train holds one feature matrix per degree
    A_train = extraFeatureCreature(X_train.copy(), sample_d)
    
    for deg in sample_d:
        
        # Splitting the training data for this degree into train and cross-validation sets
        # ( using y_train rather than y so the row counts match, and new variable names so the outer split isn't overwritten )
        X_tr, X_cross, y_tr, y_cross = train_test_split(A_train[deg - 1], y_train, test_size=0.2)
        
        if( trainer == "SGDRegressor" ):
            
            # It's better to use SGDRegressor
            for lam in sample_alpha:
                sg = SGDRegressor(alpha=lam, max_iter=1000)
                sg.fit(X_tr,y_tr)
                
                pred_train = sg.predict(X_tr)
                pred_cross = sg.predict(X_cross)
                
                MSE_train = mean_squared_error(pred_train, y_tr)
                MSE_cross = mean_squared_error(pred_cross, y_cross)
                
                # Keeping the result of the best ( lowest cross-validation error ) alpha for this degree
                if( lam == sample_alpha[0] or MSE_cross < error_tracker[deg - 1][1] ):
                    error_tracker[deg - 1][0] = MSE_train
                    error_tracker[deg - 1][1] = MSE_cross
                    error_tracker[deg - 1][2] = lam
                    error_tracker[deg - 1][3] = deg
                
        elif ( trainer == "SVR_Ridge" ):
            
            # It's better to use a "linear SVR" or "rbf SVR" regressor
            # We use the "rbf" kernel because it usually gives better predictions, but since it's a more complicated model \
            # it takes much more time
            
            # Initializing the SVR model
            clf = svm.SVR(kernel= _kernel, max_iter=1000)
            
            # Fitting the model on the training set
            clf.fit(X_tr,y_tr)
            
            # Predicting outputs
            pred_train = clf.predict(X_tr)
            pred_cross = clf.predict(X_cross)
            
            # Evaluating the model
            MSE_train = mean_squared_error(pred_train, y_tr)
            MSE_cross = mean_squared_error(pred_cross, y_cross)

            # Tracking the error at each step
            # Note: The SVR model doesn't use a lambda here, so we don't need to store any \
            # value in the third column of the error_tracker array
            error_tracker[deg - 1][0] = MSE_train
            error_tracker[deg - 1][1] = MSE_cross
            error_tracker[deg - 1][3] = deg
        
        else:
            
            # In later versions of the module, we will handle this case with one of the \
            # sklearn "Lasso" or "ElasticNet" regressors,
            # but for now we just inform the user to use one of the "Lasso" or "ElasticNet" regressors
            print("It's better to use 'Lasso' or 'ElasticNet' Regressors!")
            
    # Creating the extra features for the test dataset
    A_test = extraFeatureCreature(X_test.copy(), sample_d)
        
    # Plot "Degree of the model" vs. "Train and Test Error" to find the best possible model visually
    twoPlotsInOne(X1= error_tracker[:,3], \
                  y1= error_tracker[:,0], y2= error_tracker[:,1], \
                  X_label= "D (Degree of the model)", y_label= "Train and Test Error", \
                  first_legend= "Train Error", second_legend= "Test Error")
        
    # Checking which model has been used
    if( trainer == "SGDRegressor" ):
        
        # The sklearn SGDRegressor model has been used
        # Finding the best lambda and degree ( degree cast to int so it can be used as an index )
        best_lam = error_tracker[np.argmin(error_tracker[:,1])][2]
        best_deg = int(error_tracker[np.argmin(error_tracker[:,1])][3])

        sg = SGDRegressor(alpha=best_lam, max_iter=1000)
        sg.fit(A_train[best_deg - 1],y_train)
        pred = sg.predict(A_test[best_deg - 1])
        
    elif( trainer == "SVR_Ridge" ):
        
        # The "SVR" or "Ridge" regressor has been used
        # Finding the best degree
        best_deg = int(error_tracker[np.argmin(error_tracker[:,1])][3])
        best_lam = None
        
        # Initializing the SVR model
        clf = svm.SVR(kernel= _kernel, max_iter=1000)
        clf.fit(A_train[best_deg - 1],y_train)
        pred = clf.predict(A_test[best_deg - 1])
        
    else:
        
        # "Lasso" or "ElasticNet" should have been used
        # In later versions of this module we will implement this section as well
        return "It's better to use 'Lasso' or 'ElasticNet' Regressors!"
        
    # Checking how well our model generalizes
    test_error = mean_squared_error(y_test,pred)

    return { "Degree" : best_deg, "Lambda": best_lam, "Test Error": test_error }

def findBestTrainer(X):
    
    # This threshold is inspired by the sklearn flow chart
    # Link for more information : http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
    if( len(X) > 100000 ):
        return "SGDRegressor"
    else:
        
        # Checking whether only a few features are likely to be important
        # This heuristic is also inspired by the sklearn flow chart.
        # Link for more information : http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
        # In later versions of this function we should make better logic for this.
        if(X.shape[1] < 100):
            
            # It's better to use the sklearn "Lasso" or "ElasticNet" regressors
            return "Lasso_ElasticNet"
        else:
            
            # It's better to use the sklearn "SVR" or "Ridge" regressors
            return "SVR_Ridge"
        
def extraFeatureCreature(X, sample_d = np.arange(1,11,1)):
    
    # Container of our artificially created new features ( one feature matrix per degree )
    A = []
    
    # Creating artificial new features with the degrees defined in the sample_d array
    for deg in sample_d:
        
        # No new feature is created when the degree is 1, since that is just the original data
        if(deg != 1):
            # Appending the element-wise powers as extra columns ( axis=1 ), not as extra rows
            X = np.concatenate((X, np.power(X, deg)), axis=1)
        
        # Appending the feature matrix for this degree
        A.append(X)
    
    return A
    
def twoPlotsInOne(X1,y1,y2,X2= None, \
                  c1= 'red',c2= 'blue', \
                  X_label= None, y_label= None, \
                  first_legend= None, second_legend= None):
    
    # Defining the size of the plot
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_axes([0,0,1,1])
    
    # Plotting the data
    line1, = ax.plot( X1 ,y1,c=c1)
    line2, = ax.plot( X1 if X2 is None else X2 ,y2,c=c2)
    
    # Adding the 'x' axis label
    ax.set_xlabel(X_label)
    
    # Adding the 'y' axis label
    ax.set_ylabel(y_label)
    
    # Adding legends to our plot
    ax.legend((line1,line2),(first_legend,second_legend))

In [ ]:
findBestRegressorModel(X,y)
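# The degree search range can also be narrowed to keep the run time manageable,
# for instance (a usage sketch of the sample_d parameter defined above):
# findBestRegressorModel(X, y, sample_d=(1,3))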

In [ ]: