Impute missing data
Plot and visualize data to see any patterns
For the actual model, the submission Notebook should have the following -
Build models using Logistic Regression and SVM (you will learn these tonight - Wed).
Use Grid Search to evaluate model parameters (Wed Lab) and select a model.
Build a Confusion Matrix (Mon Lab) to show how well your predictions did.
The homework is due by Monday, Dec 15th, at midnight. Upload your submission the same way as Homework 1.
In [25]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_rows', 10)
In [26]:
df = pd.read_csv('/Users/ChristopherRuiz/Documents/Education/GA-Data_Science/DAT_SF_11/homeworks/data/crx.data', header=None)
df.info()
In [27]:
df.describe()
Out[27]:
In [28]:
# df[4]
# columnX = df[2]
# columnX[pd.isnull(columnX)]
# df[1].unique()
# There are no NaN values -- every column has some sort of entry.
# The missing-value marker turned out to be '?'. Replace '?' with NaN so we can compute the column mean.
# pandas skips NaN when computing summary statistics like the mean.
# Only columns 1 and 13 have '?' entries among the numeric columns; all other float/int columns have values.
df[1] = df[1].replace('?', np.nan)
df[13] = df[13].replace('?', np.nan)
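A quick way to confirm which columns still contain the '?' marker is to count it per column (a sketch; it only assumes '?' is the missing-value marker used throughout crx.data):
In [ ]:
# cast to str so numeric columns compare cleanly, then count '?' per column
(df.astype(str) == '?').sum()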
In [29]:
# converting columns 1 and 13 to float.
df[1] = df[1].astype(float)
df[13] = df[13].astype(float)
In [30]:
# checking for null values
df[df[1].isnull()]
Out[30]:
In [31]:
print 'Mean Col1:', df[1].mean()
print 'Std Col1:', df[1].std()
print ' '
print 'Mean Col13:', df[13].mean()
print 'Std Col13:', df[13].std()
In [32]:
# Draw impute values from a normal distribution centered on each column's mean and standard deviation
# (column 1: mean 31.57, std 11.96; column 13: mean 184.01, std 173.81).
# n is the number of missing entries to fill (12 for column 1, 13 for column 13).
def get_ColOne_impute_values(n):
    return np.random.normal(31.5681710914, 11.9578624983, n)

def get_ColThirteen_impute_values(n):
    return np.random.normal(184.014771049, 173.806768225, n)
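The same idea can be written once and reused for any numeric column (a sketch; impute_from_normal is a hypothetical helper, not something used elsewhere in this notebook):
In [ ]:
# draw one sample per missing entry, using the column's own mean and std
def impute_from_normal(series):
    n_missing = series.isnull().sum()
    return np.random.normal(series.mean(), series.std(), n_missing)

# example usage, equivalent to the column-specific helpers above:
# df.loc[df[1].isnull(), 1] = impute_from_normal(df[1])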
In [33]:
dfNullValuesColOne = df[1].isnull()
df[dfNullValuesColOne]
Out[33]:
In [34]:
df.loc[df[1].isnull(), 1] = get_ColOne_impute_values(n=12)
In [35]:
df[dfNullValuesColOne]
Out[35]:
In [36]:
dfNullValuesColThirteen = df[13].isnull()
df[dfNullValuesColThirteen]
Out[36]:
In [37]:
df.loc[df[13].isnull(), 13] = get_ColThirteen_impute_values(n=13)
df[dfNullValuesColThirteen]
Out[37]:
In [38]:
letters=['a','b']
np.random.choice(letters, 1)
Out[38]:
In [39]:
def get_ColZero_impute_values(n):
    # np.random.choice needs a list of candidate values plus a sample size
    return np.random.choice(['a', 'b'], n)
In [40]:
def A1_map(val):
    if val == 'b':
        return 1
    elif val == '?':
        return '?'
    else:
        return 0

df['A1_map'] = df[0].map(A1_map)
df.head(5)
df['A1_map'] = df['A1_map'].replace('?', np.nan)
df['A1_map'] = df['A1_map'].astype(float)
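A more compact alternative (a sketch): Series.map with a dict leaves any value not listed -- including '?' -- as NaN automatically, so the separate replace and astype steps aren't needed:
In [ ]:
# 'b' -> 1.0, 'a' -> 0.0, anything else (e.g. '?') -> NaN
df[0].map({'b': 1.0, 'a': 0.0}).head()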
In [41]:
print 'Mean A1_map:', df['A1_map'].mean()
print 'Std A1_map:', df['A1_map'].std()
In [42]:
# (re-defines the same column-13 impute helper as above; kept for reference)
def get_ColThirteen_impute_values(n):
    return np.random.normal(184.014771049, 173.806768225, n)
In [43]:
# df[df[0]=='?']
In [44]:
# dfNullValuesColZero = df[df[0]=='?']
# dfNullValuesColZero
# df.loc[df[df[0]=='?'],12 ] = get_ColZero_impute_values(n=12)
# df.loc[df[df[0]=='?'], 'Age'] = get_age_impute_values(n=177)
# df[dfNullValuesColThirteen]
In [45]:
# Remove rows with '?' in column 6 (df[6]); this takes care of the remaining missing entries in that column.
df = df[df[6] != '?']
df
Out[45]:
In [46]:
df[0].value_counts() / df[0].size
# impute the remaining '?' entries in column 0, sampling 'a'/'b' with the hard-coded proportions
# taken from the value_counts output above; observed values are left untouched
def impute_a1(val):
    if val == '?':
        return np.random.choice(['a', 'b'], p=[0.7, 0.3])
    return val

df[0] = df[0].map(impute_a1)
df[0].unique()
Out[46]:
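The sampling probabilities could also be read from the data instead of being hard-coded (a sketch; it assumes column 0 held only 'a', 'b', and the '?' marker before imputation):
In [ ]:
# observed proportions among the non-missing entries; the index gives the labels and
# the values give the weights to pass to np.random.choice
observed = df[0][df[0] != '?'].value_counts()
observed / observed.sum()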
In [47]:
df
Out[47]:
In [48]:
df[1].hist(by=df[5],bins=20, sharey=True)
Out[48]:
In [49]:
df[1].hist(by=df[0],bins=20, sharey=True)
Out[49]:
In [50]:
df[1].hist(bins=20)
Out[50]:
In [51]:
# Rechecking data for reference
df.describe()
Out[51]:
In [52]:
# Rechecking data for reference
df.head(5)
Out[52]:
In [53]:
# columnZeroCount = df.groupby([0])[15].agg(['count'])
# print columnZeroCount
# columnZeroCount.hist()
# df[0].values
df.info()
In [54]:
plt.scatter(df[1],df[7],s=70,alpha=.5)
plt.xlim(0)
plt.ylim(0)
Out[54]:
In [55]:
plt.scatter(df[1],df[14],s=70,alpha=.5)
plt.axis('tight')
# plt.xlim(0)
# plt.ylim(90)
Out[55]:
In [56]:
criteria = df[df[14]<950]
# criteria
plt.scatter(criteria[1],criteria[14],s=70,alpha=.5)
plt.axis('tight')
Out[56]:
In [57]:
criteria_Two = df[df[14]<150]
# criteria
plt.scatter(criteria_Two[1],criteria_Two[14],s=70,alpha=.5)
plt.axis('tight')
Out[57]:
In [58]:
plt.scatter(df[1],df[2],s=70,alpha=.5)
plt.xlim(0)
plt.ylim(0)
Out[58]:
In [59]:
plt.scatter(df[7],df[10],s=70,alpha=.5)
plt.xlim(0)
plt.ylim(0)
Out[59]:
In [60]:
plt.scatter(df[10],df[14],s=70,alpha=.5)
# plt.xlim(0, 50)
# plt.ylim(0,50)
plt.xlim(0)
plt.ylim(0)
Out[60]:
In [61]:
column_Zero = df.groupby(df[0]).agg('count')
column_Zero
# df.hist(by=df[0], sharey=True)
Out[61]:
In [62]:
# for i in list(df.columns.values):
# print i
pd.scatter_matrix(df[[0, 1, 2, 3,4,5,6,7,8,9,10,11,12,13,14,15]]);
In [63]:
from mpl_toolkits.mplot3d import Axes3D
ThreeD_Graph = plt.figure().gca(projection='3d')
ThreeD_Graph.scatter(df[1], df[2], df[13], s = 5)
ThreeD_Graph.set_xlabel(1)
ThreeD_Graph.set_ylabel(2)
ThreeD_Graph.set_zlabel(13)
plt.show()
In [64]:
box_Color = dict(boxes='DarkGreen', whiskers='DarkOrange',medians='DarkBlue', caps='Gray')
# df[1].plot(kind='box', color=box_Color, sym='r+')
# df[2].plot(kind='box', color=box_Color, sym='r+')
# df[7].plot(kind='box', color=box_Color, sym='r+')
df[13].plot(kind='box', color=box_Color, sym='r+')
# df[14].plot(kind='box', color=box_Color, sym='r+')
Out[64]:
In [65]:
plt.scatter(df[1],df[2],c=df[13], s=70,alpha=.5)
Out[65]:
In [66]:
X_data = df[[1,2,13]]
In [67]:
X_data.info()
In [68]:
X_data = pd.get_dummies(X_data)
In [69]:
y_data = df[14]
In [70]:
X_data
Out[70]:
In [71]:
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
In [72]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=12, test_size=0.2)
In [73]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
Out[73]:
In [74]:
clf.score(X_test, y_test)
Out[74]:
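A single train/test split can give a noisy estimate, so a cross-validated score is a useful check (a sketch using the same older sklearn.cross_validation API as the rest of the notebook):
In [ ]:
from sklearn.cross_validation import cross_val_score
# mean 5-fold cross-validated accuracy for the same logistic regression model
cross_val_score(LogisticRegression(), X_data, y_data, cv=5).mean()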
In [75]:
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)
Out[75]:
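The raw counts are easier to read with the class labels attached (a sketch; clf.classes_ holds the label order sklearn uses for both the rows and the columns):
In [ ]:
# rows = true labels, columns = predicted labels
pd.DataFrame(confusion_matrix(y_test, y_pred), index=clf.classes_, columns=clf.classes_)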
In [76]:
print classification_report(y_test, y_pred)
# A warning appeared:
# /Library/Python/2.7/site-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
#   'precision', 'predicted', average, warn_for)
# This means at least one class never appears in y_pred, so its precision cannot be computed and is reported as 0.0.
In [77]:
pd.DataFrame(zip(X_data.columns, np.transpose(clf.coef_)))
Out[77]:
In [78]:
y_pred_df = pd.DataFrame(clf.predict_proba(X_test))
y_pred_df.rename(columns={0: 'No', 1: 'Yes'}, inplace=True)
y_pred_df['y_pred'] = y_pred
y_pred_df['y_true'] = y_test
y_pred_df.tail()
Out[78]:
In [79]:
sns.regplot(y_pred, y_test, x_jitter=0.1, y_jitter=0.1)
Out[79]:
Build an SVM classifier using sklearn.svm.LinearSVC. What is the default value of the regularization parameter C?
In [80]:
#initialize C=1e-3
# the documentation tells us the default value is 1
est = LinearSVC(C=1e-3)
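The default can also be confirmed directly from an unconfigured estimator rather than the docs (a sketch using get_params, which every sklearn estimator provides):
In [ ]:
# LinearSVC() with no arguments carries the library defaults; C comes back as 1.0
LinearSVC().get_params()['C']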
Fit the model with the training data
In [81]:
est.fit(X_train, y_train)
Out[81]:
Score our model using test data
In [82]:
est.score(X_test, y_test)
Out[82]:
In [83]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# We generate a grid in the square [-3, 3]^2.
xx, yy = np.meshgrid(np.linspace(-3, 3, 500),
                     np.linspace(-3, 3, 500))

# This function takes a fitted SVM estimator as input, plus the points and labels to overlay.
# def plot_decision_function(fitted_est):
def plot_decision_function(fitted_est, X, y):
    # We evaluate the decision function on the grid.
    Z = fitted_est.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cmap = plt.cm.coolwarm
    # We display the decision function on the grid.
    plt.figure(figsize=(5, 5))
    plt.imshow(Z,
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               aspect='auto', origin='lower', cmap=cmap)
    # We display the decision boundary.
    plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='k')
    # We display the points with their true labels.
    plt.scatter(X[:, 0], X[:, 1], s=30, c=.5 + .5 * y, lw=1,
                cmap=cmap, vmin=0, vmax=1)
    plt.axhline(0, color='k', ls='--')
    plt.axvline(0, color='k', ls='--')
    plt.xticks(())
    plt.yticks(())
    plt.axis([-3, 3, -3, 3])
In [84]:
est.fit(X_train, y_train)
Out[84]:
In [85]:
# NOTE: the grid inside plot_decision_function is 2-D, so the estimator must have been fit on
# exactly two features (e.g. refit on df[[1, 2]]); otherwise decision_function raises a shape error.
plot_decision_function(est, df[[1, 2]].values, df[13].values)
# plot_decision_function(est)
Use Grid Search to tune C over the values in np.logspace(-3., 3., 10). Read the example for grid search in sklearn.grid_search.GridSearchCV.
In [ ]:
from sklearn.grid_search import GridSearchCV
# d = {'C':[0,2,1]}
# d['C'] = np.logspace(-3., 3., 10)
# clf = GridSearchCV(svr, parameters)
gs = GridSearchCV(LinearSVC(),{'C':np.logspace(-3., 3., 10)})
# GridSearch will look through all of your parameters
# iris = datasets.load_iris()
# parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
# svr = svm.SVC()
# clf = grid_search.GridSearchCV(svr, parameters)
# clf.fit(iris.data, iris.target)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
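Beyond best_params_, the fitted grid search also records the best cross-validated score and the mean score for every C tried (a sketch; grid_scores_ is the results attribute in this older sklearn.grid_search API):
In [ ]:
print 'best CV score:', gs.best_score_
# each entry holds the parameter setting, its mean CV score, and the per-fold scores
for params, mean_score, fold_scores in gs.grid_scores_:
    print params, mean_score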
For this you will use sklearn.svm.SVC
Let's first make sure we understand how to read the documentation:
Implement an SVM classifier using the defaults and fit to our data:
In [ ]:
from sklearn.svm import SVC
svcL = SVC()
svcL.fit(X_train, y_train)
Plot the decision function
In [ ]:
plot_decision_function(svcL,df[[1,2]].values,df[13].values)
In [ ]:
param = {'C':np.logspace(-3., 3., 10),'gamma':np.logspace(-3., 3., 10)}
# np.logspace(-3., 3., 10) is a commonly used search range for C and gamma -- it's just a sensible default, not a strict rule
gsL = GridSearchCV(SVC(),param)
gsL.fit(X_train, y_train)
In [ ]:
plot_decision_function(gsL.best_estimator_, df[[1,2]].values,df[13].values)
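Before reading too much into the plotted boundary, it's worth checking which C/gamma combination the grid search settled on (a sketch; best_params_ and best_score_ are standard GridSearchCV attributes):
In [ ]:
print 'best params:', gsL.best_params_
print 'best CV score:', gsL.best_score_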