Import required libraries


In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
# Showing matplotlib plots in jupyter notebook
%matplotlib inline

Notes before the final version


In [11]:
# Using patientId for better intuition
# Drawing polar heatmap
# Learning curve with smaller numbers
# How we can use SVR for regression
# Making a module for finding the best sklearn function

Read Data


In [12]:
# Getting Training dataset
# Import dataset as Dataframe
df_full = pd.read_csv("../Dataset/slice_localization_data.csv", sep=',')

# Import dataset as numpy array
X_full = np.genfromtxt("../Dataset/slice_localization_data.csv", delimiter=',', skip_header=1)

# Making a distinct output column (the "reference" target) for later use
y = X_full[:,385]

# Removing the first (patientId) and last (reference) columns from the dataset
X = X_full[:,1:385]

# Converting X to Dataframe
df = pd.DataFrame(X)
df.columns = df_full.columns[1:385]

# Getting size of the Training Dataset
m,n = X.shape
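The same X and y can also be derived from the DataFrame alone, which avoids parsing the CSV twice; a minimal sketch, assuming df_full as loaded above:

# Equivalent construction from the DataFrame (sketch)
X_alt = df_full.drop(["patientId", "reference"], axis=1).values
y_alt = df_full["reference"].values

# Sanity check: these should match the numpy-based X and y above
assert X_alt.shape == X.shape and np.allclose(X_alt, X) and np.allclose(y_alt, y)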

In [13]:
# Getting shape of the Dataset
X_full.shape


Out[13]:
(53500, 386)

In [14]:
# Getting first 10 rows of the numpy array
X_full[0:10]


Out[14]:
array([[  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         21.803851],
       [  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         21.745726],
       [  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         21.6876  ],
       ..., 
       [  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         21.396971],
       [  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         21.28072 ],
       [  0.      ,   0.      ,   0.      , ...,  -0.25    ,  -0.25    ,
         22.617612]])

In [15]:
# Getting first 10 rows of the dataframe
df_full.head(10)


Out[15]:
patientId value0 value1 value2 value3 value4 value5 value6 value7 value8 ... value375 value376 value377 value378 value379 value380 value381 value382 value383 reference
0 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.980381 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.803851
1 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.745726
2 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.687600
3 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.629474
4 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.976833 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.571348
5 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.953202 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.513223
6 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.000000 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.455097
7 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.867572 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.396971
8 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.930170 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.280720
9 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.990034 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 22.617612

10 rows × 386 columns


In [16]:
# Getting summary statistics to build some intuition about the data
df_full.describe()


Out[16]:
patientId value0 value1 value2 value3 value4 value5 value6 value7 value8 ... value375 value376 value377 value378 value379 value380 value381 value382 value383 reference
count 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 ... 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000
mean 47.075701 0.059627 0.071558 0.145819 0.218728 0.274762 0.276189 0.204531 0.062281 -0.042025 ... -0.029404 0.182913 0.320112 0.359373 0.342889 0.266091 0.083049 -0.031146 -0.154524 47.028039
std 27.414240 0.174243 0.196921 0.300270 0.359163 0.378862 0.369605 0.351294 0.292232 0.268391 ... 0.085817 0.383333 0.463517 0.478188 0.471811 0.437633 0.279734 0.098738 0.122491 22.347042
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 -0.250000 -0.250000 -0.250000 ... -0.250000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 -0.250000 -0.250000 1.738733
25% 23.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 29.891607
50% 46.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 43.987893
75% 70.000000 0.000000 0.000000 0.000000 0.446429 0.684477 0.662382 0.441412 0.000000 0.000000 ... 0.000000 0.000000 0.996286 0.999677 0.999560 0.949478 0.000000 0.000000 0.000000 63.735059
max 96.000000 1.000000 1.000000 1.000000 1.000000 0.998790 0.996468 0.999334 1.000000 1.000000 ... 0.961279 1.000000 1.000000 1.000000 1.000000 1.000000 0.999857 0.996839 0.942851 97.489115

8 rows × 386 columns


In [17]:
# Getting the unique values of the "patientId" column
np.unique(df_full["patientId"])


Out[17]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96], dtype=int64)
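Since the notes above mention using patientId for better intuition, it may also help to see how many CT slices each patient contributes; a small sketch, assuming df_full from above:

# Number of CT slices recorded per patient (sketch)
slices_per_patient = df_full["patientId"].value_counts().sort_index()
slices_per_patient.describe()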

Functions Section


In [18]:
# Import mean_squared_error function from the sklearn library
from sklearn.metrics import mean_squared_error

# Note: this function relies on the globally defined model `lm` and on `train_test_split`,
# both of which are set up in the Prediction Section below.
def plotLearningCurves(X,y,step):
    
    m,n = X.shape
    maxVal = int(m / 10) * 10
    N_size_arr = np.arange(10, maxVal + 10, step)
    error_arr = np.zeros(( len(N_size_arr), 2 ))
    index = 0
    
    # Increasing the training set size by "step" in each iteration
    for i in N_size_arr:
        
        # Splitting the first i rows into training and cross-validation sets
        X_train, X_test, y_train, y_test = train_test_split(X[:i,:], y[:i], test_size=0.33, random_state=42)
        
        # Fitting the model
        lm.fit(X_train, y_train)
        
        # Computing the mean squared error of the predictions on both the training and cross-validation sets
        error_arr[index,0] = mean_squared_error(y_train , lm.predict(X_train))
        error_arr[index,1] = mean_squared_error(y_test, lm.predict(X_test))
        
        # Moving to the next row of the error array
        index += 1
    
    # Initializing the figure
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_axes([0,0,1,1])
    ax.set_yscale('log')
    
    # Plotting "Training set size" vs. "Mean Squared Error" for both the training and cross-validation errors
    line1, = ax.plot(N_size_arr,error_arr[:,0], c='red')
    line2, = ax.plot(N_size_arr,error_arr[:,1], c='blue')
    
    # Adding labels and legends to the plot
    ax.set_xlabel("N (Training set size)")
    ax.set_ylabel("Mean Squared Error")
    
    ax.legend((line1,line2),("Train Error","Test Error"))
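Once the linear model lm and train_test_split from the Prediction Section below are defined, the helper can be called directly; a usage sketch (not executed here) on a smaller subset with a finer step:

# Usage sketch: learning curves on the first 5000 rows with a step of 500
# (assumes `lm` and `train_test_split` are already defined, as in the Prediction Section)
plotLearningCurves(X[:5000, :], y[:5000], 500)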

Data analysis


In [19]:
# Plot "Means of each column in dataset( Attributes ) except first and last column"  distplot
sns.distplot(df.mean())

# ===> We can conclude all of our attributes are in [-1,1] range, so we don't need to use feature normalize technique


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x853ff90d30>
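The range claim can also be checked directly from the raw values rather than from the distribution of the column means; a minimal sketch, assuming the df built above:

# Overall minimum and maximum across all attribute columns (sketch)
print(df.values.min(), df.values.max())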

In [20]:
fig = plt.figure(figsize=(12,10))

axes1 = fig.add_axes([0, 2, 1, 1], projection='polar')
axes2 = fig.add_axes([1, 2,1,1], projection='polar')
axes3 = fig.add_axes([0, 1, 1, 1], projection='polar')
axes4 = fig.add_axes([1, 1,1,1], projection='polar')

# Plotting the bone structure histogram of the first example slice
axes1.plot(X[150,0:240], 'bo', ms=10)
axes1.set_xlabel("Bone Structure")

# Plotting the air inclusion histogram of the first example slice
axes2.plot(X[150,240:384], 'ro', ms=10)
axes2.set_xlabel("Air Inclusion")

# Plotting the bone structure histogram of the second example slice
axes3.plot(X[3541,0:240], 'bo', ms=10)
axes3.set_xlabel("Bone Structure")

# Plotting the air inclusion histogram of the second example slice
axes4.plot(X[3541,240:384], 'ro', ms=10)
axes4.set_xlabel("Air Inclusion")


Out[20]:
<matplotlib.text.Text at 0x85392e33c8>

In [21]:
# Plot "Reference" column distplot
plt.figure(figsize=(12,8))
sns.distplot(y, bins=100)

# ==> We can see that we don't have any image for locations of body with value bigger than 100, and majority of the
# images are taken with values in [25,40] range


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x853bb86160>
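Both observations can be double-checked numerically; a small sketch using the y array from above:

# Maximum reference value and the share of slices in the [25, 40] range (sketch)
print(y.max())
print(np.mean((y >= 25) & (y <= 40)))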

Prediction Section


In [22]:
X.shape


Out[22]:
(53500, 384)

In [23]:
y.shape


Out[23]:
(53500,)

In [24]:
# Using the sklearn "LinearRegression" model
from sklearn.linear_model import LinearRegression

# Initializing the model
lm = LinearRegression()

In [25]:
# Splitting Data into Training and Cross Validation Datasets
# Using Sklearn library for splitting data
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


C:\Users\Danietzio\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [26]:
# Fitting the model on the training set ( all columns except "patientId" and "reference" )
lm.fit(X_train,y_train)


Out[26]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
# Model coefficients ( thetas )
lm.coef_


Out[27]:
array([ -2.49139644e+00,  -8.69497759e-01,   9.24385440e-01,
         6.62012145e+00,   6.42405712e+00,   3.29487223e+00,
         8.78259130e-01,   3.23571134e+00,  -5.55753081e+00,
        -9.17715543e-01,  -1.69933517e+00,   7.63349941e-02,
         3.51426894e-01,   4.42502516e-01,  -2.99870138e-01,
         3.65831336e-01,   1.07190094e+00,   6.17675949e-01,
        -3.23179052e+00,   3.59100341e+00,   1.69590241e+00,
        -4.87804896e-01,  -2.08363544e+00,   1.31187654e+00,
        -7.91772202e-01,  -5.14274490e-01,  -7.51391761e-01,
        -9.40538410e+00,   1.10672289e+01,   3.89230047e+00,
         5.89338084e-01,   8.42582554e-01,  -8.38247461e-01,
         3.45091308e-01,   5.93980190e-01,  -3.47319519e+00,
        -4.27445811e+00,  -5.78937940e-01,  -5.57114137e+00,
         1.25405079e+00,  -3.80578414e+00,  -1.00349029e+00,
        -3.04525502e+00,   5.39976900e-02,  -2.39727413e+00,
        -1.92287396e+00,  -2.35967959e-01,   2.05902910e+00,
         2.40929366e+13,   7.64767555e+12,  -3.02840588e+00,
         7.46169872e-01,  -1.28493065e+00,  -1.21671433e+00,
        -3.12624409e-01,  -1.83113386e+00,   1.27002833e+00,
        -5.24887151e+00,  -7.31673243e+00,  -1.78773285e+12,
        -2.31417879e-01,   1.70438096e+00,  -3.75049629e-01,
        -2.95822088e+00,  -2.82389455e+00,  -2.15940207e+00,
        -2.76272857e+00,   4.92988949e+00,   4.65814689e+00,
        -1.56717591e+12,   1.21568880e+00,   2.01366810e+00,
         2.39857854e+00,   4.19334127e-01,   1.82824677e+00,
         1.65955812e+00,  -3.88685965e+00,  -8.39860206e+00,
        -2.40929366e+13,  -7.64767555e+12,   8.53463422e-01,
        -1.83045438e+00,  -1.42406102e+00,   5.02939469e-02,
        -1.58933153e-01,  -1.24758944e+00,  -5.02475387e-01,
         4.87069649e+00,   9.24792557e-01,  -6.32266424e+00,
         1.02663821e+00,  -1.68896754e+00,   1.14282888e+00,
        -6.38146996e-01,   8.44215733e-01,   5.08413705e-01,
        -1.78708864e+00,   9.85034978e+00,   1.89526049e+01,
         1.45151874e+01,  -1.02847512e+00,  -2.05870634e+00,
         1.64076954e+00,   2.00666033e-01,   7.20281244e-01,
         1.07582834e+00,   3.83371713e+00,   1.47878423e+00,
        -3.50697200e+00,   4.06742493e-01,  -3.75842269e+00,
        -3.34627399e-01,  -7.94629849e-01,   7.89308402e-01,
         5.05005881e+00,   3.00721873e+00,   1.75354535e+00,
         6.54587875e-02,  -6.07755595e+00,  -9.58380083e+00,
        -4.51227986e+00,   1.79619506e+00,   1.38762830e+00,
        -1.11675286e+00,  -1.95215528e-01,  -4.46430042e-01,
        -1.01762836e+00,  -4.90434385e-01,  -2.46614388e+00,
         2.66611610e+00,  -1.37647967e+00,   9.19185319e-01,
         4.12840641e+00,  -1.93961937e+00,   9.62455171e-02,
        -3.44545716e+00,  -1.91869019e+00,  -4.04060392e+00,
        -1.89344418e+00,  -2.34833230e+00,   2.43700677e-01,
        -6.81411001e-01,   1.76999030e+00,   2.05834954e+00,
         9.47026006e-01,  -1.57526265e+00,  -1.50836248e+00,
         1.05008774e+00,   1.20334452e+00,  -7.36290933e-01,
        -2.48729200e+00,  -2.17372204e-01,  -1.72407074e+00,
         1.75942564e+00,  -4.29781897e-01,   4.22466593e-01,
        -1.76424428e+00,   5.70585784e-02,   1.68115256e+00,
         2.38964324e+00,  -5.48876505e-01,   5.09385654e-01,
        -5.28876561e-03,  -1.26062911e+00,   1.03249402e-01,
        -3.27606799e-01,   1.20136955e+00,  -2.37390600e+00,
         1.99007954e+00,  -1.79947697e+12,  -2.51057925e+00,
         5.39038268e-03,  -2.29575901e-01,  -5.40080274e-01,
        -2.43405514e+00,   1.54854318e+00,  -3.82061390e+00,
        -5.67866499e-01,  -1.52594609e+11,  -1.86097702e+11,
        -1.88487776e+00,  -3.12075466e-01,   2.28495997e+00,
         1.45810689e+00,  -1.00238233e-01,  -1.82567682e+00,
        -1.03564647e-01,  -4.01340042e+00,   1.52594609e+11,
        -1.59672034e+11,  -1.71662085e-01,   2.68049324e+00,
        -1.58635237e-01,  -1.06797034e+00,   3.87919947e-01,
        -1.90020933e+00,  -1.38165666e+00,   1.83603724e+00,
        -9.90663673e-01,   1.79947697e+12,  -8.31048668e-01,
        -4.49540231e-01,  -1.51146070e+00,  -1.28526709e+00,
        -2.59604957e-01,   2.85563774e-01,  -1.69227694e+00,
         1.55269077e+00,   4.56701133e-01,   3.76849632e-01,
        -1.91458902e+00,  -2.92021795e+00,   2.30222746e+00,
         1.22653657e+00,   7.59727889e-01,  -1.86165009e+00,
        -2.96483101e-01,  -2.43002973e+00,  -3.37867719e+00,
         1.40224303e-01,  -1.82232050e+00,   1.05350968e-01,
         3.57860806e+00,  -9.49098202e-01,  -1.98310114e+00,
        -3.39329335e+00,  -2.78765940e+00,  -3.25634706e+00,
        -3.30424687e+00,   4.19654650e+00,  -2.58865917e+00,
         6.68237559e-01,  -2.17007799e+00,   4.64827510e-01,
        -6.60295172e-01,   5.13933061e-01,  -1.20241829e+00,
        -3.52026591e+00,   1.96907532e+00,   1.08473219e+00,
         3.21247668e-01,   2.98323206e-01,  -2.53328599e+00,
         4.41584666e-02,  -6.94276797e-01,  -1.79061197e-02,
         1.69862841e+01,  -1.20302516e+01,  -7.91865986e-01,
         1.15307801e+00,  -5.60941443e-01,   1.82852235e+00,
         1.45005318e+00,   7.15954274e-01,  -1.91101345e+01,
         2.70094907e-01,   2.80762452e-02,  -1.03779321e+00,
        -2.64925205e-02,   3.14016966e-01,   3.00345634e-01,
        -2.63706836e-01,   5.15326447e+00,   1.11614771e+00,
        -1.70415347e+00,  -7.59155240e-01,   1.52691429e+00,
         2.29132262e-01,   1.16583636e+00,   1.95544752e+00,
        -3.54843662e+00,   1.15491786e+12,  -9.34081205e-01,
        -4.97381602e+00,  -2.02276697e+00,  -7.59189901e-01,
         2.52376438e-01,  -3.71735077e+00,   2.78257244e+00,
        -9.78950961e+00,  -9.89492103e-01,  -1.28198642e+00,
         4.75202494e-01,   7.94580733e-01,  -2.55015760e-01,
         1.89247225e-01,   4.06032824e+00,  -1.15491786e+12,
         1.88293172e+00,   8.78387911e-01,  -3.95531286e-02,
         1.07337961e+00,   1.29379368e-01,   2.16743534e+00,
         6.45827481e+00,   1.50418650e+00,   9.96240341e-01,
        -1.39106242e+00,  -3.83620675e-01,   2.15893836e-01,
         1.23155795e+00,   5.84703457e-02,  -2.45713237e+00,
        -3.74744777e+00,  -1.74112312e-01,  -1.84108023e-01,
         8.41178657e-01,   1.56226816e+00,  -4.72601244e-02,
        -6.37956140e-02,   8.20889187e+00,   9.20855720e+00,
        -1.87929198e-01,  -1.08199184e+00,  -2.18851157e+00,
         1.50152084e+00,   1.04881007e+00,   1.25676152e+00,
         8.00823186e-01,   4.54454666e+00,  -6.16253690e-01,
         1.23379083e+00,  -1.06513798e+00,  -2.82781086e+00,
         8.52455747e-01,  -6.12104802e-01,   1.23602662e+00,
        -3.77366610e+00,  -6.06277656e-01,  -1.01648290e+00,
        -6.57410728e-01,   5.15356659e-02,  -8.37834528e-02,
         9.33323282e-01,   9.75698977e-01,   1.60999119e+01,
        -6.76887026e-01,   3.83724313e-02,  -9.73070573e-01,
        -1.53679424e+00,   2.25893447e+00,   5.70659997e+00,
        -5.76431091e+00,  -6.76112093e+11,   9.39291231e-01,
         1.05380389e-01,   1.09836955e-01,   2.68028740e+00,
         1.15890375e-01,   2.16729178e+00,   3.56678901e-01,
         1.66181605e+09,   2.63546480e-01,  -3.98429128e-01,
        -1.62373434e-01,  -4.31590128e-01,   7.76861210e-01,
         3.67767608e-01,  -4.93437066e+00,   6.76112093e+11,
        -1.74274564e-01,   5.48986200e-01,  -3.43278254e+00,
        -1.41548161e+00,   1.91442591e+00,   8.33643186e-02,
        -2.72351738e+00,  -1.86789723e+01,  -4.71399312e-01,
        -1.46130203e-01,  -1.03484869e+00,  -1.07678057e+00,
         8.70736585e-01,  -8.20396077e-01,  -5.71111702e+00,
         2.41724076e+00,   1.24450984e-01,  -3.63836904e-01,
        -1.58250963e+00,   1.16640127e+00,   7.09022751e-01,
         1.81274474e+00,   4.05353730e+00,   5.36275010e+00])

In [28]:
# Predicting reference values on the cross-validation dataset
pred = lm.predict(X_test)

# Evaluating the error with the mean squared error formula, without using the sklearn helper
eval_err = np.sum( ( pred - y_test ) ** 2 , axis=0 ) / len(pred)
eval_err


Out[28]:
68.254364896165498
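The manual computation above should agree with sklearn's helper; a quick check:

# Should print (approximately) the same value as eval_err above
print(mean_squared_error(y_test, pred))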

In [29]:
# Plot predictions vs. y_test for a better understanding of how our model works!
fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0,0,1,1])

ax.set_xlabel("Test Target Variable")
ax.set_ylabel("Predictions")
ax.plot(y_test, pred,'bo',ms=1)


Out[29]:
[<matplotlib.lines.Line2D at 0x853c0af390>]

In [30]:
plotLearningCurves(X,y,1000)


Trying to get better predictions


In [31]:
# Checking the error on the training set and comparing it with the test set error
pred_train = lm.predict(X_train)
mean_squared_error(pred_train,y_train)

# ===> The MSE is high on both the training and cross-validation datasets, so the algorithm may be suffering from high bias


Out[31]:
1.5484293731660227e+18

In [32]:
# Normalizing features may not improve accuracy, but it can reduce the fitting time
lm.normalize = True
lm.fit(X_train,y_train)


Out[32]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [33]:
pred_train = lm.predict(X_train)
mean_squared_error(pred_train, y_train)

# ===> We can see normalizing doesn't help


Out[33]:
67.796342478357161

In [34]:
# Using another sklearn regression model
from sklearn.linear_model import SGDRegressor

sg = SGDRegressor(alpha=0,max_iter=10000,n_iter=10000,learning_rate='constant',eta0=0.0001)


C:\Users\Danietzio\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.
  DeprecationWarning)

In [35]:
# Fitting Model
sg.fit(X_train,y_train)


Out[35]:
SGDRegressor(alpha=0, average=False, epsilon=0.1, eta0=0.0001,
       fit_intercept=True, l1_ratio=0.15, learning_rate='constant',
       loss='squared_loss', max_iter=10000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [36]:
# Predicting using the new model ( SGDRegressor )
pred = sg.predict(X_test)

In [37]:
# Evaluating model
mean_squared_error(y_test,pred)


Out[37]:
68.293097049145004

In [38]:
# Plot predictions vs. y_test for a better understanding of how our model works!
fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0,0,1,1])

ax.set_xlabel("Test Target Variable")
ax.set_ylabel("Predictions")
ax.plot(y_test, pred,'bo',ms=1)


Out[38]:
[<matplotlib.lines.Line2D at 0x853ee22320>]

In [ ]:
# Because the train error and the test error are very close to each other and both are fairly high, we can conclude \
# that we are suffering from high bias, so we should add extra features

# In the last try we saw that more iterations don't help much, \
# so we try adding more features instead
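# A standard alternative for creating extra features is sklearn's PolynomialFeatures transformer.
# The sketch below is illustrative only (run on a small slice, since a degree-2 expansion of all 384
# columns produces tens of thousands of columns); the module below builds its own polynomial
# features via extraFeatureCreature instead. It assumes X_train from the earlier split is available.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly_sample = poly.fit_transform(X_train[:100])
print(X_poly_sample.shape)   # (100, 74304): the 384 original columns plus all squared and pairwise terms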

# Function for finding the best degree and alpha
# Before running this function we should import the following:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import svm

# Version 1.0.0
def findBestRegressorModel(X, y, sample_d=(1,10), sample_alpha=None, _kernel='rbf'):
    
    # sample_d should be tuple with size of 2
    if( type(sample_d) == tuple ):
        if( len(sample_d) != 2):
            raise ValueError("sample_d length should be 2 !!!")
        else:
            sample_d = np.arange(sample_d[0], sample_d[1] + 1, 1)
    else:
        raise ValueError("sample_d should be tuple !!!")
    
    # Finding which trainer model is better to use
    trainer = findBestTrainer(X)
    
    # Defining the default candidate values of the regularization strength ( alpha )
    if( sample_alpha is None ):
        sample_alpha = np.array([0.0001,0.001,0.003,0.01,0.03,0.1,0.3,1,3,10])
    
    # Container of the model errors for each degree ( the last two columns hold the lambda and degree values \
    # of each iteration )
    error_tracker = np.zeros(( len(sample_d), 4))
        
    # Splitting data into training and test datasets ( the cross-validation split happens inside the loop )
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # Creating artificial new features with the degrees defined in the sample_d array
    # A_train holds one feature matrix per degree
    A_train = extraFeatureCreature(X_train.copy(), sample_d)
    
    for deg in sample_d:
        
        # Splitting the training data for this degree into train and cross-validation sets
        # ( using y_train rather than y so the row counts match, and new variable names so the outer split isn't overwritten )
        X_tr, X_cross, y_tr, y_cross = train_test_split(A_train[deg - 1], y_train, test_size=0.2)
        
        if( trainer == "SGDRegressor" ):
            
            # It's better to use SGDRegressor
            for lam in sample_alpha:
                sg = SGDRegressor(alpha=lam, max_iter=1000)
                sg.fit(X_tr,y_tr)
                
                pred_train = sg.predict(X_tr)
                pred_cross = sg.predict(X_cross)
                
                MSE_train = mean_squared_error(pred_train, y_tr)
                MSE_cross = mean_squared_error(pred_cross, y_cross)
                
                # Keeping the result of the best ( lowest cross-validation error ) alpha for this degree
                if( lam == sample_alpha[0] or MSE_cross < error_tracker[deg - 1][1] ):
                    error_tracker[deg - 1][0] = MSE_train
                    error_tracker[deg - 1][1] = MSE_cross
                    error_tracker[deg - 1][2] = lam
                    error_tracker[deg - 1][3] = deg
                
        elif ( trainer == "SVR_Ridge" ):
            
            # It's better to use a "linear SVR" or "rbf SVR" regressor
            # We use the "rbf" kernel because it usually gives better predictions, but since it's a more complicated model \
            # it takes much more time
            
            # Initializing the SVR model
            clf = svm.SVR(kernel= _kernel, max_iter=1000)
            
            # Fitting the model on the training set
            clf.fit(X_tr,y_tr)
            
            # Predicting outputs
            pred_train = clf.predict(X_tr)
            pred_cross = clf.predict(X_cross)
            
            # Evaluating the model
            MSE_train = mean_squared_error(pred_train, y_tr)
            MSE_cross = mean_squared_error(pred_cross, y_cross)

            # Tracking the error at each step
            # Note: The SVR model doesn't use a lambda here, so we don't need to store any \
            # value in the third column of the error_tracker array
            error_tracker[deg - 1][0] = MSE_train
            error_tracker[deg - 1][1] = MSE_cross
            error_tracker[deg - 1][3] = deg
        
        else:
            
            # In later versions of the module, we will handle this case with one of the \
            # sklearn "Lasso" or "ElasticNet" regressors,
            # but for now we just inform the user to use one of the "Lasso" or "ElasticNet" regressors
            print("It's better to use 'Lasso' or 'ElasticNet' Regressors!")
            
    # Creating the extra features for the test dataset
    A_test = extraFeatureCreature(X_test.copy(), sample_d)
        
    # Plot "Degree of the model" vs. "Train and Test Error" to find the best possible model visually
    twoPlotsInOne(X1= error_tracker[:,3], \
                  y1= error_tracker[:,0], y2= error_tracker[:,1], \
                  X_label= "D (Degree of the model)", y_label= "Train and Test Error", \
                  first_legend= "Train Error", second_legend= "Test Error")
        
    # Checking which model has been used
    if( trainer == "SGDRegressor" ):
        
        # The sklearn SGDRegressor model has been used
        # Finding the best lambda and degree ( degree cast to int so it can be used as an index )
        best_lam = error_tracker[np.argmin(error_tracker[:,1])][2]
        best_deg = int(error_tracker[np.argmin(error_tracker[:,1])][3])

        sg = SGDRegressor(alpha=best_lam, max_iter=1000)
        sg.fit(A_train[best_deg - 1],y_train)
        pred = sg.predict(A_test[best_deg - 1])
        
    elif( trainer == "SVR_Ridge" ):
        
        # The "SVR" or "Ridge" regressor has been used
        # Finding the best degree
        best_deg = int(error_tracker[np.argmin(error_tracker[:,1])][3])
        best_lam = None
        
        # Initializing the SVR model
        clf = svm.SVR(kernel= _kernel, max_iter=1000)
        clf.fit(A_train[best_deg - 1],y_train)
        pred = clf.predict(A_test[best_deg - 1])
        
    else:
        
        # "Lasso" or "ElasticNet" should have been used
        # In later versions of this module we will implement this section as well
        return "It's better to use 'Lasso' or 'ElasticNet' Regressors!"
        
    # Checking how well our model generalizes
    test_error = mean_squared_error(y_test,pred)

    return { "Degree" : best_deg, "Lambda": best_lam, "Test Error": test_error }

def findBestTrainer(X):
    
    # This threshold is inspired by the sklearn flow chart
    # Link for more information : http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
    if( len(X) > 100000 ):
        return "SGDRegressor"
    else:
        
        # Checking whether only a few features are likely to be important
        # This heuristic is also inspired by the sklearn flow chart.
        # Link for more information : http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
        # In later versions of this function we should make better logic for this.
        if(X.shape[1] < 100):
            
            # It's better to use the sklearn "Lasso" or "ElasticNet" regressors
            return "Lasso_ElasticNet"
        else:
            
            # It's better to use the sklearn "SVR" or "Ridge" regressors
            return "SVR_Ridge"
        
def extraFeatureCreature(X, sample_d = np.arange(1,11,1)):
    
    # Container of our artificially created new features ( one feature matrix per degree )
    A = []
    
    # Creating artificial new features with the degrees defined in the sample_d array
    for deg in sample_d:
        
        # No new feature is created when the degree is 1, since that is just the original data
        if(deg != 1):
            # Appending the element-wise powers as extra columns ( axis=1 ), not as extra rows
            X = np.concatenate((X, np.power(X, deg)), axis=1)
        
        # Appending the feature matrix for this degree
        A.append(X)
    
    return A
    
def twoPlotsInOne(X1,y1,y2,X2= None, \
                  c1= 'red',c2= 'blue', \
                  X_label= None, y_label= None, \
                  first_legend= None, second_legend= None):
    
    # Defining the size of the plot
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_axes([0,0,1,1])
    
    # Plotting the data
    line1, = ax.plot( X1 ,y1,c=c1)
    line2, = ax.plot( X1 if X2 is None else X2 ,y2,c=c2)
    
    # Adding the 'x' axis label
    ax.set_xlabel(X_label)
    
    # Adding the 'y' axis label
    ax.set_ylabel(y_label)
    
    # Adding legends to our plot
    ax.legend((line1,line2),(first_legend,second_legend))

In [ ]:
findBestRegressorModel(X,y)
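# The degree search range can also be narrowed to keep the run time manageable,
# for instance (a usage sketch of the sample_d parameter defined above):
# findBestRegressorModel(X, y, sample_d=(1,3))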

In [ ]: