In [1]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [31]:

    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

from scipy import stats, optimize
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.pipeline import Pipeline

from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import explained_variance_score, r2_score, median_absolute_error

print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The pandas version is {}.'.format(pd.__version__))
print('The numpy version is {}.'.format(np.__version__))









    



The scikit-learn version is 0.18.1.
The pandas version is 0.19.2.
The numpy version is 1.12.0.

Read the CSV

We use pandas read_csv(path/to/csv) method to read the csv file. Next, replace the missing values with np.NaN i.e. Not a Number. This way we can count the number of missing values per column.

The dataset as described on UC Irvine repo. (125 predictive, 4 non-predictive, 18 potential goal)

We remove the features which are final goals and some other irrelevant features. For example the following attribute is to be predicted. murdPerPop: number of murders per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)



In [24]:

    
df = pd.read_csv('../datasets/UCIrvineCrimeData.csv');
df = df.replace('?',np.NAN)
features = [x for x in df.columns if x not in ['fold', 'state', 'community', 'communityname', 'county'
                                               ,'ViolentCrimesPerPop']]

Find the number of missing values in every column



In [25]:

    
df.isnull().sum()









    Out[25]:





state                      0
county                  1174
community               1177
communityname              0
fold                       0
population                 0
householdsize              0
racepctblack               0
racePctWhite               0
racePctAsian               0
racePctHisp                0
agePct12t21                0
agePct12t29                0
agePct16t24                0
agePct65up                 0
numbUrban                  0
pctUrban                   0
medIncome                  0
pctWWage                   0
pctWFarmSelf               0
pctWInvInc                 0
pctWSocSec                 0
pctWPubAsst                0
pctWRetire                 0
medFamInc                  0
perCapInc                  0
whitePerCap                0
blackPerCap                0
indianPerCap               0
AsianPerCap                0
                        ... 
PctSameHouse85             0
PctSameCity85              0
PctSameState85             0
LemasSwornFT            1675
LemasSwFTPerPop         1675
LemasSwFTFieldOps       1675
LemasSwFTFieldPerPop    1675
LemasTotalReq           1675
LemasTotReqPerPop       1675
PolicReqPerOffic        1675
PolicPerPop             1675
RacialMatchCommPol      1675
PctPolicWhite           1675
PctPolicBlack           1675
PctPolicHisp            1675
PctPolicAsian           1675
PctPolicMinor           1675
OfficAssgnDrugUnits     1675
NumKindsDrugsSeiz       1675
PolicAveOTWorked        1675
LandArea                   0
PopDens                    0
PctUsePubTrans             0
PolicCars               1675
PolicOperBudg           1675
LemasPctPolicOnPatr     1675
LemasGangUnitDeploy     1675
LemasPctOfficDrugUn        0
PolicBudgPerPop         1675
ViolentCrimesPerPop        0
dtype: int64

Eliminating samples or features with missing values

One of the easiest ways to deal with missing values is to simply remove the corresponding features(columns) or samples(rows) from the dataset entirely. Rows with missing values can be easily dropped via the dropna method.



In [5]:

    
df.dropna()









    Out[5]:






  
    
      
      state
      county
      community
      communityname
      fold
      population
      householdsize
      racepctblack
      racePctWhite
      racePctAsian
      ...
      LandArea
      PopDens
      PctUsePubTrans
      PolicCars
      PolicOperBudg
      LemasPctPolicOnPatr
      LemasGangUnitDeploy
      LemasPctOfficDrugUn
      PolicBudgPerPop
      ViolentCrimesPerPop
    
  
  
    
      16
      36
      1
      1000
      Albanycity
      1
      0.15
      0.31
      0.40
      0.63
      0.14
      ...
      0.06
      0.39
      0.84
      0.06
      0.06
      0.91
      0.5
      0.88
      0.26
      0.49
    
    
      23
      19
      193
      93926
      SiouxCitycity
      1
      0.11
      0.43
      0.04
      0.89
      0.09
      ...
      0.16
      0.12
      0.07
      0.04
      0.01
      0.81
      1
      0.56
      0.09
      0.63
    
    
      33
      51
      680
      47672
      Lynchburgcity
      1
      0.09
      0.43
      0.51
      0.58
      0.04
      ...
      0.14
      0.11
      0.19
      0.05
      0.01
      0.75
      0
      0.60
      0.1
      0.31
    
    
      68
      34
      23
      58200
      PerthAmboycity
      1
      0.05
      0.59
      0.23
      0.39
      0.09
      ...
      0.01
      0.73
      0.28
      0
      0.02
      0.64
      0
      1.00
      0.23
      0.50
    
    
      74
      9
      9
      46520
      Meridentown
      1
      0.08
      0.39
      0.08
      0.85
      0.04
      ...
      0.07
      0.21
      0.04
      0.02
      0.01
      0.7
      1
      0.44
      0.11
      0.14
    
    
      80
      36
      119
      84000
      Yonkerscity
      1
      0.29
      0.42
      0.27
      0.64
      0.18
      ...
      0.05
      0.87
      1.00
      0.21
      0.14
      0.69
      0.5
      0.61
      0.34
      0.30
    
    
      95
      25
      17
      37875
      Maldencity
      1
      0.07
      0.34
      0.08
      0.84
      0.32
      ...
      0.01
      0.89
      1.00
      0
      0.01
      0.88
      0
      0.35
      0.09
      0.37
    
    
      99
      33
      11
      45140
      Manchestercity
      1
      0.14
      0.34
      0.02
      0.96
      0.07
      ...
      0.09
      0.25
      0.08
      0.01
      0.02
      0.79
      0
      0.44
      0.1
      0.09
    
    
      134
      34
      31
      13690
      Cliftoncity
      1
      0.10
      0.34
      0.03
      0.89
      0.21
      ...
      0.03
      0.53
      0.32
      0.05
      0.02
      0.75
      0
      1.00
      0.12
      0.08
    
    
      145
      25
      17
      37000
      Lowellcity
      1
      0.15
      0.51
      0.05
      0.71
      0.68
      ...
      0.04
      0.63
      0.16
      0
      0.02
      0.68
      0
      1.00
      0.12
      0.53
    
    
      155
      34
      31
      56550
      Passaiccity
      1
      0.08
      0.67
      0.40
      0.16
      0.43
      ...
      0.01
      1.00
      0.70
      0.06
      0.02
      0.86
      0
      0.70
      0.18
      0.44
    
    
      174
      25
      9
      37490
      Lynncity
      1
      0.11
      0.40
      0.16
      0.74
      0.23
      ...
      0.03
      0.63
      0.50
      0.04
      0.02
      0.86
      1
      0.41
      0.15
      0.73
    
    
      175
      34
      23
      82000
      Woodbridgetownship
      1
      0.13
      0.51
      0.13
      0.80
      0.34
      ...
      0.07
      0.34
      0.39
      0.02
      0.02
      0.84
      0
      0.36
      0.1
      0.17
    
    
      177
      34
      17
      36000
      JerseyCitycity
      1
      0.35
      0.50
      0.58
      0.21
      0.70
      ...
      0.04
      1.00
      1.00
      0.29
      0.19
      0.49
      0
      0.55
      0.36
      0.83
    
    
      222
      42
      71
      41216
      Lancastercity
      2
      0.07
      0.42
      0.24
      0.56
      0.12
      ...
      0.02
      0.63
      0.31
      0.02
      0.01
      0.87
      0
      0.00
      0.14
      0.36
    
    
      227
      44
      7
      54640
      Pawtucketcity
      2
      0.10
      0.33
      0.07
      0.84
      0.04
      ...
      0.02
      0.70
      0.18
      0.06
      0.01
      0.84
      0.5
      1.00
      0.1
      0.19
    
    
      233
      9
      9
      52070
      NewHaventown
      2
      0.19
      0.44
      0.70
      0.29
      0.15
      ...
      0.05
      0.58
      0.53
      0.09
      0.05
      0.76
      0.5
      0.32
      0.21
      0.88
    
    
      262
      51
      760
      67000
      Richmondcity
      2
      0.31
      0.30
      1.00
      0.13
      0.05
      ...
      0.17
      0.28
      0.72
      0.28
      0.1
      0.79
      0.5
      0.48
      0.22
      0.67
    
    
      277
      9
      1
      74190
      Stratfordtown
      2
      0.06
      0.39
      0.15
      0.85
      0.05
      ...
      0.05
      0.23
      0.11
      0.01
      0.01
      0.93
      0
      0.51
      0.13
      0.08
    
    
      293
      42
      77
      2000
      Allentowncity
      2
      0.15
      0.34
      0.10
      0.79
      0.08
      ...
      0.05
      0.50
      0.19
      0.04
      0.02
      0.86
      0
      0.72
      0.08
      0.28
    
    
      312
      34
      17
      3580
      Bayonnecity
      2
      0.08
      0.32
      0.09
      0.86
      0.11
      ...
      0.01
      0.91
      0.96
      0.04
      0.03
      0.79
      0
      0.49
      0.23
      0.20
    
    
      313
      39
      151
      12000
      Cantoncity
      2
      0.12
      0.37
      0.35
      0.71
      0.02
      ...
      0.06
      0.35
      0.14
      0.07
      0.02
      0.66
      0
      0.93
      0.12
      0.51
    
    
      315
      36
      63
      51055
      NiagaraFallscity
      2
      0.08
      0.30
      0.30
      0.73
      0.01
      ...
      0.04
      0.37
      0.17
      0.04
      0.02
      0.62
      0
      0.66
      0.16
      0.30
    
    
      338
      9
      9
      82870
      WestHaventown
      2
      0.07
      0.38
      0.24
      0.76
      0.12
      ...
      0.03
      0.42
      0.19
      0.02
      0.01
      0.93
      0
      0.58
      0.15
      0.12
    
    
      345
      55
      9
      31000
      GreenBaycity
      2
      0.14
      0.36
      0.01
      0.92
      0.14
      ...
      0.13
      0.18
      0.12
      0.01
      0.02
      0.36
      1
      0.80
      0.1
      0.18
    
    
      366
      42
      49
      24000
      Eriecity
      2
      0.16
      0.40
      0.23
      0.79
      0.03
      ...
      0.06
      0.41
      0.21
      0.06
      0.01
      0.76
      0
      0.43
      0.06
      0.32
    
    
      380
      42
      11
      63624
      Readingcity
      2
      0.11
      0.36
      0.19
      0.68
      0.09
      ...
      0.03
      0.67
      0.41
      0.04
      0.02
      0.67
      1
      0.65
      0.14
      0.49
    
    
      384
      34
      39
      21000
      Elizabethcity
      2
      0.16
      0.52
      0.39
      0.47
      0.17
      ...
      0.03
      0.75
      0.59
      0.08
      0.05
      0.84
      1
      0.76
      0.22
      0.54
    
    
      401
      55
      79
      53000
      Milwaukeecity
      3
      0.99
      0.42
      0.59
      0.44
      0.11
      ...
      0.28
      0.55
      0.62
      0.37
      0.38
      0.57
      0.5
      0.24
      0.25
      0.40
    
    
      413
      24
      510
      4000
      Baltimorecity
      3
      1.00
      0.44
      1.00
      0.07
      0.06
      ...
      0.23
      0.76
      1.00
      0.67
      0.58
      0.74
      0
      0.36
      0.34
      1.00
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      1441
      36
      55
      63000
      Rochestercity
      8
      0.36
      0.34
      0.61
      0.41
      0.11
      ...
      0.10
      0.54
      0.60
      0.3
      0.18
      0.73
      1
      0.56
      0.34
      0.48
    
    
      1445
      34
      13
      47500
      Montclairtownship
      8
      0.04
      0.41
      0.60
      0.47
      0.14
      ...
      0.02
      0.50
      1.00
      0
      0.01
      0.91
      0
      0.42
      0.15
      0.21
    
    
      1476
      42
      69
      69000
      Scrantoncity
      8
      0.12
      0.36
      0.03
      0.96
      0.05
      ...
      0.07
      0.27
      0.17
      0.04
      0.01
      0.98
      0
      0.59
      0.07
      0.12
    
    
      1494
      34
      17
      79610
      WestNewYorktown
      8
      0.05
      0.43
      0.08
      0.65
      0.12
      ...
      0.00
      1.00
      1.00
      0
      0.01
      0.77
      0
      0.51
      0.18
      0.23
    
    
      1511
      34
      17
      32250
      Hobokencity
      8
      0.04
      0.21
      0.11
      0.68
      0.27
      ...
      0.00
      1.00
      1.00
      0.02
      0.02
      0.7
      0
      1.00
      0.32
      0.32
    
    
      1512
      36
      67
      73000
      Syracusecity
      8
      0.25
      0.37
      0.40
      0.62
      0.13
      ...
      0.07
      0.55
      0.61
      0.14
      0.1
      0.45
      0
      0.78
      0.27
      0.35
    
    
      1544
      27
      53
      43000
      Minneapoliscity
      8
      0.58
      0.25
      0.25
      0.67
      0.26
      ...
      0.16
      0.56
      0.92
      0.31
      0.15
      0.09
      0.5
      0.22
      0.17
      0.75
    
    
      1555
      25
      17
      39835
      Medfordcity
      8
      0.08
      0.43
      0.08
      0.90
      0.12
      ...
      0.02
      0.59
      0.94
      0
      0.01
      0.34
      0
      0.52
      0.15
      0.32
    
    
      1575
      51
      550
      16000
      Chesapeakecity
      8
      0.23
      0.58
      0.53
      0.55
      0.08
      ...
      0.99
      0.04
      0.06
      0.24
      0.02
      0.71
      1
      0.47
      0.05
      0.18
    
    
      1594
      34
      29
      38550
      Lakewoodtownship
      8
      0.06
      0.49
      0.27
      0.69
      0.09
      ...
      0.07
      0.15
      0.22
      0.02
      0.02
      0.78
      1
      0.66
      0.23
      0.24
    
    
      1626
      9
      3
      50440
      NewBritaintown
      9
      0.11
      0.36
      0.15
      0.72
      0.11
      ...
      0.04
      0.47
      0.17
      0.03
      0.02
      0.62
      0
      0.72
      0.18
      0.30
    
    
      1633
      36
      29
      11000
      Buffalocity
      9
      0.51
      0.31
      0.60
      0.46
      0.06
      ...
      0.12
      0.68
      0.75
      0.25
      0.32
      0.86
      0.5
      0.33
      0.44
      0.80
    
    
      1640
      9
      1
      8070
      Bridgeporttown
      9
      0.21
      0.47
      0.52
      0.37
      0.14
      ...
      0.04
      0.74
      0.36
      0.1
      0.06
      0
      0
      0.63
      0.21
      0.79
    
    
      1643
      33
      11
      50260
      Nashuacity
      9
      0.11
      0.40
      0.03
      0.93
      0.12
      ...
      0.09
      0.21
      0.06
      0.02
      0.03
      0.69
      1
      0.52
      0.16
      0.03
    
    
      1645
      50
      7
      10675
      Burlingtoncity
      9
      0.05
      0.45
      0.02
      0.96
      0.09
      ...
      0.03
      0.31
      0.19
      0.01
      0.01
      0.8
      0
      0.23
      0.22
      0.07
    
    
      1716
      51
      810
      82000
      VirginiaBeachcity
      9
      0.62
      0.57
      0.27
      0.71
      0.26
      ...
      0.72
      0.13
      0.04
      0.4
      0.12
      0.8
      1
      0.81
      0.1
      0.12
    
    
      1751
      51
      700
      56000
      NewportNewscity
      9
      0.26
      0.44
      0.65
      0.43
      0.14
      ...
      0.20
      0.21
      0.16
      0.2
      0.04
      0.76
      1
      0.82
      0.1
      0.35
    
    
      1777
      51
      710
      57000
      Norfolkcity
      9
      0.40
      0.58
      0.76
      0.34
      0.16
      ...
      0.15
      0.41
      0.31
      0.18
      0.08
      0.75
      0.5
      0.79
      0.14
      0.47
    
    
      1810
      34
      23
      51210
      NewBrunswickcity
      10
      0.05
      0.77
      0.58
      0.35
      0.24
      ...
      0.01
      0.67
      0.69
      0.03
      0.02
      0.81
      0
      0.50
      0.23
      0.62
    
    
      1835
      34
      31
      57000
      Patersoncity
      10
      0.21
      0.73
      0.70
      0.10
      0.09
      ...
      0.02
      1.00
      0.59
      0.1
      0.04
      0.75
      0
      1.00
      0.13
      0.65
    
    
      1839
      9
      3
      22630
      EastHartfordtown
      10
      0.06
      0.35
      0.16
      0.80
      0.13
      ...
      0.05
      0.23
      0.33
      0.02
      0.02
      0.79
      0
      0.65
      0.2
      0.16
    
    
      1844
      34
      23
      19000
      EastBrunswicktownship
      10
      0.05
      0.58
      0.04
      0.82
      0.56
      ...
      0.06
      0.16
      0.56
      0.04
      0.01
      0.77
      0
      0.23
      0.14
      0.06
    
    
      1847
      34
      13
      51000
      Newarkcity
      10
      0.43
      0.62
      1.00
      0.00
      0.07
      ...
      0.07
      0.97
      1.00
      0.02
      0.03
      0.8
      0
      0.00
      0.05
      1.00
    
    
      1859
      55
      101
      66000
      Racinecity
      10
      0.12
      0.44
      0.36
      0.64
      0.03
      ...
      0.04
      0.46
      0.19
      0.03
      0.03
      0.69
      0.5
      0.61
      0.19
      0.34
    
    
      1878
      25
      9
      34550
      Lawrencecity
      10
      0.10
      0.56
      0.12
      0.47
      0.12
      ...
      0.02
      0.84
      0.15
      0
      0.02
      0.54
      0
      0.93
      0.14
      0.88
    
    
      1880
      34
      39
      40350
      Lindencity
      10
      0.04
      0.39
      0.39
      0.65
      0.09
      ...
      0.03
      0.28
      0.32
      0.02
      0.01
      0.85
      0
      0.99
      0.19
      0.22
    
    
      1963
      36
      27
      59641
      Poughkeepsiecity
      10
      0.03
      0.32
      0.61
      0.47
      0.09
      ...
      0.01
      0.47
      0.42
      0.07
      0.08
      0.49
      0
      0.37
      1
      0.45
    
    
      1981
      9
      9
      35650
      Hamdentown
      10
      0.07
      0.38
      0.17
      0.84
      0.11
      ...
      0.09
      0.13
      0.17
      0.02
      0.01
      0.72
      0
      0.62
      0.15
      0.07
    
    
      1991
      9
      9
      80070
      Waterburytown
      10
      0.16
      0.37
      0.25
      0.69
      0.04
      ...
      0.08
      0.32
      0.18
      0.08
      0.06
      0.78
      0
      0.91
      0.28
      0.23
    
    
      1992
      25
      17
      72600
      Walthamcity
      10
      0.08
      0.51
      0.06
      0.87
      0.22
      ...
      0.03
      0.38
      0.33
      0.02
      0.02
      0.79
      0
      0.22
      0.18
      0.19
    
  

123 rows × 128 columns

Similarly, we can drop columns that have atleast one NaN in any row by setting the axis argument to 1:



In [6]:

    
df.dropna(axis=1);

The dropna() method supports additional parameters that can come in handy.



In [7]:

    
#only drop rows where all columns are null
df.dropna(how='all');



In [8]:

    
# drop rows that have not at least 4 non-NaN values
df.dropna(thresh=4);



In [9]:

    
# only drop rows where NaN appear in specific columns (here :'community')
df.dropna(subset=['community']);

Imputing missing values

Often, the removal of samples or dropping of entire feature columns is simply not feasible, because we might lost too much valuable data. In this case, we can use different interpolation techniques to estimate the missing values from the othere training samples in our dataset. One of the most common interpolation technique is mean interpolation, where we simply replace the missing value by the mean value of the entire feature column. A convenient way to achieve this is using the Imputer class from the scikit-learn as shown in the following code.



In [10]:

    
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(df[features])
imputed_data = imr.transform(df[features]);

Sklearn fundamentals

A convenient way to randomly partition the dataset into a separate test & training dataset is to use the train_test_split function from scikit-learn's cross_validation submodule. As of now, the target variable is ''



In [11]:

    
#df = df.drop(["communityname", "state", "county", "community"], axis=1)
X, y = imputed_data, df['ViolentCrimesPerPop']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0);

First, we assigned the NumPy array representation of features columns to the variable X, and we assigned the predicted variable to the variable y. Then we used the train_test_split function to randomly split X and y into separate training & test datasets. By setting test_size=0.3 we assigned 30 percent of samples to X_test and the remaining 70 percent to X_train.

Sequential Feature Selection algorithm : Sequential Backward Algorithm(SBS)

Sequential feature selection algorithms are a family of greedy search algorithms that can reduce an initial d-dimensional feature space into a k-dimensional feature subspace where k < d. The idea is to select the most relevant subset of features to improve computational efficieny and reduce generalization error



In [12]:

    
class SBS():
    def __init__(self, estimator, features, 
                 scoring=r2_score, test_size=0.25,
                random_state=1):
        self.scoring = scoring
        self.estimator = estimator
        self.features = features
        self.test_size = test_size
        self.random_state = random_state
    
    def fit(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, 
                                                           y, 
                                                           test_size = self.test_size,
                                                           random_state = self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        self.scores_ = [score]
        
        while dim > self.features:
            scores = []
            subsets = []
            for p in combinations(self.indices_, r=dim-1):
                score = self._calc_score(X_train, y_train, X_test, y_test, p)
                scores.append(score)
                subsets.append(p)
            best = np.argmax(score)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
        self.k_score_ = self.scores_[-1]
        return self
    
    def transform(self, X):
        return X[:, self.indices_]
    
    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score

In the preceding implementation, we define k_features to be the final number of features when the algorithm should stop. Inside the while loop of the fit method, the feature subsets created by the itertools.combination function are evaluated and reduced until the feature subset has the desired dimensionality. In each iteration, the accuracy score of the best subset is collected in a list self.scores_ based on the internally created test dataset X_test. We will use thoese scores to evaluate the results.

Let's see our implementation using the LinearRegression



In [13]:

    
clf = LinearRegression() #This can also be replaced by Lasso() or Ridge()
sbs = SBS(clf, features=1)
sbs.fit(X_train, y_train)









    Out[13]:





<__main__.SBS instance at 0x7efff18d02d8>



In [28]:

    
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([-2, 3])
plt.ylabel('Accuracy')
plt.xlabel('Number of Features')
plt.grid()
plt.show()

I have used r2_score as a measurement of accuracy. R^2 (coefficient of determination) regression score function. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.

Also, please note that the accuracy scores are reported on the validation set.

Let us see which 100 features give us the best score of 0.75 on the validation set



In [29]:

    
k100 = sbs.subsets_[23]



In [30]:

    
features100 = [features[x] for x in k100]
print features100









    



['population', 'householdsize', 'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 'numbUrban', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire', 'medFamInc', 'perCapInc', 'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'HispPerCap', 'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu', 'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv', 'PersPerFam', 'PctFam2Par', 'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par', 'PctWorkMomYoungKids', 'PctWorkMom', 'NumIlleg', 'PctIlleg', 'NumImmig', 'PctImmigRecent', 'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10', 'PctRecentImmig', 'PctRecImmig5', 'PctRecImmig8', 'PctRecImmig10', 'PctSpeakEnglOnly', 'PctNotSpeakEnglWell', 'PctLargHouseFam', 'PctLargHouseOccup', 'PersPerOccupHous', 'PersPerOwnOccHous', 'PersPerRentOccHous', 'PctPersOwnOccup', 'PctPersDenseHous', 'PctHousLess3BR', 'MedNumBR', 'HousVacant', 'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos', 'MedYrHousBuilt', 'PctHousNoPhone', 'PctWOFullPlumb', 'OwnOccLowQuart', 'OwnOccMedVal', 'OwnOccHiQuart', 'RentLowQ', 'RentMedian', 'RentHighQ', 'MedRent', 'MedRentPctHousInc', 'MedOwnCostPctInc', 'MedOwnCostPctIncNoMtg', 'NumInShelters', 'NumStreet', 'PctForeignBorn', 'PctBornSameState', 'PctSameHouse85', 'PctSameCity85', 'PctSameState85', 'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps']



In [45]:

    
plt.figure(figsize=(20, 6))
classifiers = [LinearRegression(), Lasso(), Ridge()]
names = ["Linear Regression", "Lasso", "Ridge"]
for i, classifier, name in zip(range(len(classifiers)), classifiers, names):
    ax = plt.subplot(1, len(classifiers), i+1)
    ax.set_ylim([-1, 1])
    plt.setp(ax.get_yticklabels(), visible=True)
    plt.setp(ax.get_xticklabels(), visible=True)
    sbs = SBS(classifier, features=1)
    sbs.fit(X_train, y_train)
    k_feat = [len(k) for k in sbs.subsets_]
    plt.plot(k_feat, sbs.scores_, marker='o', label="Accuracy")
    plt.ylabel('Accuracy')
    plt.xlabel('Number of Features')
    plt.grid()
    plt.legend(loc="best")
    plt.title("For {} min-score = {:.2e} & max-score={:.2e}".format(name, min(sbs.scores_), max(sbs.scores_)))
plt.show()



In [41]:









    Out[41]:





0.15187618504340605



In [ ]:

	state	county	community	communityname	fold	population	householdsize	racepctblack	racePctWhite	racePctAsian	...	LandArea	PopDens	PctUsePubTrans	PolicCars	PolicOperBudg	LemasPctPolicOnPatr	LemasGangUnitDeploy	LemasPctOfficDrugUn	PolicBudgPerPop	ViolentCrimesPerPop
16	36	1	1000	Albanycity	1	0.15	0.31	0.40	0.63	0.14	...	0.06	0.39	0.84	0.06	0.06	0.91	0.5	0.88	0.26	0.49
23	19	193	93926	SiouxCitycity	1	0.11	0.43	0.04	0.89	0.09	...	0.16	0.12	0.07	0.04	0.01	0.81	1	0.56	0.09	0.63
33	51	680	47672	Lynchburgcity	1	0.09	0.43	0.51	0.58	0.04	...	0.14	0.11	0.19	0.05	0.01	0.75	0	0.60	0.1	0.31
68	34	23	58200	PerthAmboycity	1	0.05	0.59	0.23	0.39	0.09	...	0.01	0.73	0.28	0	0.02	0.64	0	1.00	0.23	0.50
74	9	9	46520	Meridentown	1	0.08	0.39	0.08	0.85	0.04	...	0.07	0.21	0.04	0.02	0.01	0.7	1	0.44	0.11	0.14
80	36	119	84000	Yonkerscity	1	0.29	0.42	0.27	0.64	0.18	...	0.05	0.87	1.00	0.21	0.14	0.69	0.5	0.61	0.34	0.30
95	25	17	37875	Maldencity	1	0.07	0.34	0.08	0.84	0.32	...	0.01	0.89	1.00	0	0.01	0.88	0	0.35	0.09	0.37
99	33	11	45140	Manchestercity	1	0.14	0.34	0.02	0.96	0.07	...	0.09	0.25	0.08	0.01	0.02	0.79	0	0.44	0.1	0.09
134	34	31	13690	Cliftoncity	1	0.10	0.34	0.03	0.89	0.21	...	0.03	0.53	0.32	0.05	0.02	0.75	0	1.00	0.12	0.08
145	25	17	37000	Lowellcity	1	0.15	0.51	0.05	0.71	0.68	...	0.04	0.63	0.16	0	0.02	0.68	0	1.00	0.12	0.53
155	34	31	56550	Passaiccity	1	0.08	0.67	0.40	0.16	0.43	...	0.01	1.00	0.70	0.06	0.02	0.86	0	0.70	0.18	0.44
174	25	9	37490	Lynncity	1	0.11	0.40	0.16	0.74	0.23	...	0.03	0.63	0.50	0.04	0.02	0.86	1	0.41	0.15	0.73
175	34	23	82000	Woodbridgetownship	1	0.13	0.51	0.13	0.80	0.34	...	0.07	0.34	0.39	0.02	0.02	0.84	0	0.36	0.1	0.17
177	34	17	36000	JerseyCitycity	1	0.35	0.50	0.58	0.21	0.70	...	0.04	1.00	1.00	0.29	0.19	0.49	0	0.55	0.36	0.83
222	42	71	41216	Lancastercity	2	0.07	0.42	0.24	0.56	0.12	...	0.02	0.63	0.31	0.02	0.01	0.87	0	0.00	0.14	0.36
227	44	7	54640	Pawtucketcity	2	0.10	0.33	0.07	0.84	0.04	...	0.02	0.70	0.18	0.06	0.01	0.84	0.5	1.00	0.1	0.19
233	9	9	52070	NewHaventown	2	0.19	0.44	0.70	0.29	0.15	...	0.05	0.58	0.53	0.09	0.05	0.76	0.5	0.32	0.21	0.88
262	51	760	67000	Richmondcity	2	0.31	0.30	1.00	0.13	0.05	...	0.17	0.28	0.72	0.28	0.1	0.79	0.5	0.48	0.22	0.67
277	9	1	74190	Stratfordtown	2	0.06	0.39	0.15	0.85	0.05	...	0.05	0.23	0.11	0.01	0.01	0.93	0	0.51	0.13	0.08
293	42	77	2000	Allentowncity	2	0.15	0.34	0.10	0.79	0.08	...	0.05	0.50	0.19	0.04	0.02	0.86	0	0.72	0.08	0.28
312	34	17	3580	Bayonnecity	2	0.08	0.32	0.09	0.86	0.11	...	0.01	0.91	0.96	0.04	0.03	0.79	0	0.49	0.23	0.20
313	39	151	12000	Cantoncity	2	0.12	0.37	0.35	0.71	0.02	...	0.06	0.35	0.14	0.07	0.02	0.66	0	0.93	0.12	0.51
315	36	63	51055	NiagaraFallscity	2	0.08	0.30	0.30	0.73	0.01	...	0.04	0.37	0.17	0.04	0.02	0.62	0	0.66	0.16	0.30
338	9	9	82870	WestHaventown	2	0.07	0.38	0.24	0.76	0.12	...	0.03	0.42	0.19	0.02	0.01	0.93	0	0.58	0.15	0.12
345	55	9	31000	GreenBaycity	2	0.14	0.36	0.01	0.92	0.14	...	0.13	0.18	0.12	0.01	0.02	0.36	1	0.80	0.1	0.18
366	42	49	24000	Eriecity	2	0.16	0.40	0.23	0.79	0.03	...	0.06	0.41	0.21	0.06	0.01	0.76	0	0.43	0.06	0.32
380	42	11	63624	Readingcity	2	0.11	0.36	0.19	0.68	0.09	...	0.03	0.67	0.41	0.04	0.02	0.67	1	0.65	0.14	0.49
384	34	39	21000	Elizabethcity	2	0.16	0.52	0.39	0.47	0.17	...	0.03	0.75	0.59	0.08	0.05	0.84	1	0.76	0.22	0.54
401	55	79	53000	Milwaukeecity	3	0.99	0.42	0.59	0.44	0.11	...	0.28	0.55	0.62	0.37	0.38	0.57	0.5	0.24	0.25	0.40
413	24	510	4000	Baltimorecity	3	1.00	0.44	1.00	0.07	0.06	...	0.23	0.76	1.00	0.67	0.58	0.74	0	0.36	0.34	1.00
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1441	36	55	63000	Rochestercity	8	0.36	0.34	0.61	0.41	0.11	...	0.10	0.54	0.60	0.3	0.18	0.73	1	0.56	0.34	0.48
1445	34	13	47500	Montclairtownship	8	0.04	0.41	0.60	0.47	0.14	...	0.02	0.50	1.00	0	0.01	0.91	0	0.42	0.15	0.21
1476	42	69	69000	Scrantoncity	8	0.12	0.36	0.03	0.96	0.05	...	0.07	0.27	0.17	0.04	0.01	0.98	0	0.59	0.07	0.12
1494	34	17	79610	WestNewYorktown	8	0.05	0.43	0.08	0.65	0.12	...	0.00	1.00	1.00	0	0.01	0.77	0	0.51	0.18	0.23
1511	34	17	32250	Hobokencity	8	0.04	0.21	0.11	0.68	0.27	...	0.00	1.00	1.00	0.02	0.02	0.7	0	1.00	0.32	0.32
1512	36	67	73000	Syracusecity	8	0.25	0.37	0.40	0.62	0.13	...	0.07	0.55	0.61	0.14	0.1	0.45	0	0.78	0.27	0.35
1544	27	53	43000	Minneapoliscity	8	0.58	0.25	0.25	0.67	0.26	...	0.16	0.56	0.92	0.31	0.15	0.09	0.5	0.22	0.17	0.75
1555	25	17	39835	Medfordcity	8	0.08	0.43	0.08	0.90	0.12	...	0.02	0.59	0.94	0	0.01	0.34	0	0.52	0.15	0.32
1575	51	550	16000	Chesapeakecity	8	0.23	0.58	0.53	0.55	0.08	...	0.99	0.04	0.06	0.24	0.02	0.71	1	0.47	0.05	0.18
1594	34	29	38550	Lakewoodtownship	8	0.06	0.49	0.27	0.69	0.09	...	0.07	0.15	0.22	0.02	0.02	0.78	1	0.66	0.23	0.24
1626	9	3	50440	NewBritaintown	9	0.11	0.36	0.15	0.72	0.11	...	0.04	0.47	0.17	0.03	0.02	0.62	0	0.72	0.18	0.30
1633	36	29	11000	Buffalocity	9	0.51	0.31	0.60	0.46	0.06	...	0.12	0.68	0.75	0.25	0.32	0.86	0.5	0.33	0.44	0.80
1640	9	1	8070	Bridgeporttown	9	0.21	0.47	0.52	0.37	0.14	...	0.04	0.74	0.36	0.1	0.06	0	0	0.63	0.21	0.79
1643	33	11	50260	Nashuacity	9	0.11	0.40	0.03	0.93	0.12	...	0.09	0.21	0.06	0.02	0.03	0.69	1	0.52	0.16	0.03
1645	50	7	10675	Burlingtoncity	9	0.05	0.45	0.02	0.96	0.09	...	0.03	0.31	0.19	0.01	0.01	0.8	0	0.23	0.22	0.07
1716	51	810	82000	VirginiaBeachcity	9	0.62	0.57	0.27	0.71	0.26	...	0.72	0.13	0.04	0.4	0.12	0.8	1	0.81	0.1	0.12
1751	51	700	56000	NewportNewscity	9	0.26	0.44	0.65	0.43	0.14	...	0.20	0.21	0.16	0.2	0.04	0.76	1	0.82	0.1	0.35
1777	51	710	57000	Norfolkcity	9	0.40	0.58	0.76	0.34	0.16	...	0.15	0.41	0.31	0.18	0.08	0.75	0.5	0.79	0.14	0.47
1810	34	23	51210	NewBrunswickcity	10	0.05	0.77	0.58	0.35	0.24	...	0.01	0.67	0.69	0.03	0.02	0.81	0	0.50	0.23	0.62
1835	34	31	57000	Patersoncity	10	0.21	0.73	0.70	0.10	0.09	...	0.02	1.00	0.59	0.1	0.04	0.75	0	1.00	0.13	0.65
1839	9	3	22630	EastHartfordtown	10	0.06	0.35	0.16	0.80	0.13	...	0.05	0.23	0.33	0.02	0.02	0.79	0	0.65	0.2	0.16
1844	34	23	19000	EastBrunswicktownship	10	0.05	0.58	0.04	0.82	0.56	...	0.06	0.16	0.56	0.04	0.01	0.77	0	0.23	0.14	0.06
1847	34	13	51000	Newarkcity	10	0.43	0.62	1.00	0.00	0.07	...	0.07	0.97	1.00	0.02	0.03	0.8	0	0.00	0.05	1.00
1859	55	101	66000	Racinecity	10	0.12	0.44	0.36	0.64	0.03	...	0.04	0.46	0.19	0.03	0.03	0.69	0.5	0.61	0.19	0.34
1878	25	9	34550	Lawrencecity	10	0.10	0.56	0.12	0.47	0.12	...	0.02	0.84	0.15	0	0.02	0.54	0	0.93	0.14	0.88
1880	34	39	40350	Lindencity	10	0.04	0.39	0.39	0.65	0.09	...	0.03	0.28	0.32	0.02	0.01	0.85	0	0.99	0.19	0.22
1963	36	27	59641	Poughkeepsiecity	10	0.03	0.32	0.61	0.47	0.09	...	0.01	0.47	0.42	0.07	0.08	0.49	0	0.37	1	0.45
1981	9	9	35650	Hamdentown	10	0.07	0.38	0.17	0.84	0.11	...	0.09	0.13	0.17	0.02	0.01	0.72	0	0.62	0.15	0.07
1991	9	9	80070	Waterburytown	10	0.16	0.37	0.25	0.69	0.04	...	0.08	0.32	0.18	0.08	0.06	0.78	0	0.91	0.28	0.23
1992	25	17	72600	Walthamcity	10	0.08	0.51	0.06	0.87	0.22	...	0.03	0.38	0.33	0.02	0.02	0.79	0	0.22	0.18	0.19