In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [31]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

from scipy import stats, optimize
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.pipeline import Pipeline

from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import explained_variance_score, r2_score, median_absolute_error

print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The pandas version is {}.'.format(pd.__version__))
print('The numpy version is {}.'.format(np.__version__))


The scikit-learn version is 0.18.1.
The pandas version is 0.19.2.
The numpy version is 1.12.0.

Read the CSV

We use pandas read_csv(path/to/csv) method to read the csv file. Next, replace the missing values with np.NaN i.e. Not a Number. This way we can count the number of missing values per column.

The dataset as described on UC Irvine repo. (125 predictive, 4 non-predictive, 18 potential goal)

We remove the features which are final goals and some other irrelevant features. For example the following attribute is to be predicted. murdPerPop: number of murders per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)


In [24]:
df = pd.read_csv('../datasets/UCIrvineCrimeData.csv');
df = df.replace('?',np.NAN)
features = [x for x in df.columns if x not in ['fold', 'state', 'community', 'communityname', 'county'
                                               ,'ViolentCrimesPerPop']]

Find the number of missing values in every column


In [25]:
df.isnull().sum()


Out[25]:
state                      0
county                  1174
community               1177
communityname              0
fold                       0
population                 0
householdsize              0
racepctblack               0
racePctWhite               0
racePctAsian               0
racePctHisp                0
agePct12t21                0
agePct12t29                0
agePct16t24                0
agePct65up                 0
numbUrban                  0
pctUrban                   0
medIncome                  0
pctWWage                   0
pctWFarmSelf               0
pctWInvInc                 0
pctWSocSec                 0
pctWPubAsst                0
pctWRetire                 0
medFamInc                  0
perCapInc                  0
whitePerCap                0
blackPerCap                0
indianPerCap               0
AsianPerCap                0
                        ... 
PctSameHouse85             0
PctSameCity85              0
PctSameState85             0
LemasSwornFT            1675
LemasSwFTPerPop         1675
LemasSwFTFieldOps       1675
LemasSwFTFieldPerPop    1675
LemasTotalReq           1675
LemasTotReqPerPop       1675
PolicReqPerOffic        1675
PolicPerPop             1675
RacialMatchCommPol      1675
PctPolicWhite           1675
PctPolicBlack           1675
PctPolicHisp            1675
PctPolicAsian           1675
PctPolicMinor           1675
OfficAssgnDrugUnits     1675
NumKindsDrugsSeiz       1675
PolicAveOTWorked        1675
LandArea                   0
PopDens                    0
PctUsePubTrans             0
PolicCars               1675
PolicOperBudg           1675
LemasPctPolicOnPatr     1675
LemasGangUnitDeploy     1675
LemasPctOfficDrugUn        0
PolicBudgPerPop         1675
ViolentCrimesPerPop        0
dtype: int64

Eliminating samples or features with missing values

One of the easiest ways to deal with missing values is to simply remove the corresponding features(columns) or samples(rows) from the dataset entirely. Rows with missing values can be easily dropped via the dropna method.


In [5]:
df.dropna()


Out[5]:
state county community communityname fold population householdsize racepctblack racePctWhite racePctAsian ... LandArea PopDens PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop ViolentCrimesPerPop
16 36 1 1000 Albanycity 1 0.15 0.31 0.40 0.63 0.14 ... 0.06 0.39 0.84 0.06 0.06 0.91 0.5 0.88 0.26 0.49
23 19 193 93926 SiouxCitycity 1 0.11 0.43 0.04 0.89 0.09 ... 0.16 0.12 0.07 0.04 0.01 0.81 1 0.56 0.09 0.63
33 51 680 47672 Lynchburgcity 1 0.09 0.43 0.51 0.58 0.04 ... 0.14 0.11 0.19 0.05 0.01 0.75 0 0.60 0.1 0.31
68 34 23 58200 PerthAmboycity 1 0.05 0.59 0.23 0.39 0.09 ... 0.01 0.73 0.28 0 0.02 0.64 0 1.00 0.23 0.50
74 9 9 46520 Meridentown 1 0.08 0.39 0.08 0.85 0.04 ... 0.07 0.21 0.04 0.02 0.01 0.7 1 0.44 0.11 0.14
80 36 119 84000 Yonkerscity 1 0.29 0.42 0.27 0.64 0.18 ... 0.05 0.87 1.00 0.21 0.14 0.69 0.5 0.61 0.34 0.30
95 25 17 37875 Maldencity 1 0.07 0.34 0.08 0.84 0.32 ... 0.01 0.89 1.00 0 0.01 0.88 0 0.35 0.09 0.37
99 33 11 45140 Manchestercity 1 0.14 0.34 0.02 0.96 0.07 ... 0.09 0.25 0.08 0.01 0.02 0.79 0 0.44 0.1 0.09
134 34 31 13690 Cliftoncity 1 0.10 0.34 0.03 0.89 0.21 ... 0.03 0.53 0.32 0.05 0.02 0.75 0 1.00 0.12 0.08
145 25 17 37000 Lowellcity 1 0.15 0.51 0.05 0.71 0.68 ... 0.04 0.63 0.16 0 0.02 0.68 0 1.00 0.12 0.53
155 34 31 56550 Passaiccity 1 0.08 0.67 0.40 0.16 0.43 ... 0.01 1.00 0.70 0.06 0.02 0.86 0 0.70 0.18 0.44
174 25 9 37490 Lynncity 1 0.11 0.40 0.16 0.74 0.23 ... 0.03 0.63 0.50 0.04 0.02 0.86 1 0.41 0.15 0.73
175 34 23 82000 Woodbridgetownship 1 0.13 0.51 0.13 0.80 0.34 ... 0.07 0.34 0.39 0.02 0.02 0.84 0 0.36 0.1 0.17
177 34 17 36000 JerseyCitycity 1 0.35 0.50 0.58 0.21 0.70 ... 0.04 1.00 1.00 0.29 0.19 0.49 0 0.55 0.36 0.83
222 42 71 41216 Lancastercity 2 0.07 0.42 0.24 0.56 0.12 ... 0.02 0.63 0.31 0.02 0.01 0.87 0 0.00 0.14 0.36
227 44 7 54640 Pawtucketcity 2 0.10 0.33 0.07 0.84 0.04 ... 0.02 0.70 0.18 0.06 0.01 0.84 0.5 1.00 0.1 0.19
233 9 9 52070 NewHaventown 2 0.19 0.44 0.70 0.29 0.15 ... 0.05 0.58 0.53 0.09 0.05 0.76 0.5 0.32 0.21 0.88
262 51 760 67000 Richmondcity 2 0.31 0.30 1.00 0.13 0.05 ... 0.17 0.28 0.72 0.28 0.1 0.79 0.5 0.48 0.22 0.67
277 9 1 74190 Stratfordtown 2 0.06 0.39 0.15 0.85 0.05 ... 0.05 0.23 0.11 0.01 0.01 0.93 0 0.51 0.13 0.08
293 42 77 2000 Allentowncity 2 0.15 0.34 0.10 0.79 0.08 ... 0.05 0.50 0.19 0.04 0.02 0.86 0 0.72 0.08 0.28
312 34 17 3580 Bayonnecity 2 0.08 0.32 0.09 0.86 0.11 ... 0.01 0.91 0.96 0.04 0.03 0.79 0 0.49 0.23 0.20
313 39 151 12000 Cantoncity 2 0.12 0.37 0.35 0.71 0.02 ... 0.06 0.35 0.14 0.07 0.02 0.66 0 0.93 0.12 0.51
315 36 63 51055 NiagaraFallscity 2 0.08 0.30 0.30 0.73 0.01 ... 0.04 0.37 0.17 0.04 0.02 0.62 0 0.66 0.16 0.30
338 9 9 82870 WestHaventown 2 0.07 0.38 0.24 0.76 0.12 ... 0.03 0.42 0.19 0.02 0.01 0.93 0 0.58 0.15 0.12
345 55 9 31000 GreenBaycity 2 0.14 0.36 0.01 0.92 0.14 ... 0.13 0.18 0.12 0.01 0.02 0.36 1 0.80 0.1 0.18
366 42 49 24000 Eriecity 2 0.16 0.40 0.23 0.79 0.03 ... 0.06 0.41 0.21 0.06 0.01 0.76 0 0.43 0.06 0.32
380 42 11 63624 Readingcity 2 0.11 0.36 0.19 0.68 0.09 ... 0.03 0.67 0.41 0.04 0.02 0.67 1 0.65 0.14 0.49
384 34 39 21000 Elizabethcity 2 0.16 0.52 0.39 0.47 0.17 ... 0.03 0.75 0.59 0.08 0.05 0.84 1 0.76 0.22 0.54
401 55 79 53000 Milwaukeecity 3 0.99 0.42 0.59 0.44 0.11 ... 0.28 0.55 0.62 0.37 0.38 0.57 0.5 0.24 0.25 0.40
413 24 510 4000 Baltimorecity 3 1.00 0.44 1.00 0.07 0.06 ... 0.23 0.76 1.00 0.67 0.58 0.74 0 0.36 0.34 1.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1441 36 55 63000 Rochestercity 8 0.36 0.34 0.61 0.41 0.11 ... 0.10 0.54 0.60 0.3 0.18 0.73 1 0.56 0.34 0.48
1445 34 13 47500 Montclairtownship 8 0.04 0.41 0.60 0.47 0.14 ... 0.02 0.50 1.00 0 0.01 0.91 0 0.42 0.15 0.21
1476 42 69 69000 Scrantoncity 8 0.12 0.36 0.03 0.96 0.05 ... 0.07 0.27 0.17 0.04 0.01 0.98 0 0.59 0.07 0.12
1494 34 17 79610 WestNewYorktown 8 0.05 0.43 0.08 0.65 0.12 ... 0.00 1.00 1.00 0 0.01 0.77 0 0.51 0.18 0.23
1511 34 17 32250 Hobokencity 8 0.04 0.21 0.11 0.68 0.27 ... 0.00 1.00 1.00 0.02 0.02 0.7 0 1.00 0.32 0.32
1512 36 67 73000 Syracusecity 8 0.25 0.37 0.40 0.62 0.13 ... 0.07 0.55 0.61 0.14 0.1 0.45 0 0.78 0.27 0.35
1544 27 53 43000 Minneapoliscity 8 0.58 0.25 0.25 0.67 0.26 ... 0.16 0.56 0.92 0.31 0.15 0.09 0.5 0.22 0.17 0.75
1555 25 17 39835 Medfordcity 8 0.08 0.43 0.08 0.90 0.12 ... 0.02 0.59 0.94 0 0.01 0.34 0 0.52 0.15 0.32
1575 51 550 16000 Chesapeakecity 8 0.23 0.58 0.53 0.55 0.08 ... 0.99 0.04 0.06 0.24 0.02 0.71 1 0.47 0.05 0.18
1594 34 29 38550 Lakewoodtownship 8 0.06 0.49 0.27 0.69 0.09 ... 0.07 0.15 0.22 0.02 0.02 0.78 1 0.66 0.23 0.24
1626 9 3 50440 NewBritaintown 9 0.11 0.36 0.15 0.72 0.11 ... 0.04 0.47 0.17 0.03 0.02 0.62 0 0.72 0.18 0.30
1633 36 29 11000 Buffalocity 9 0.51 0.31 0.60 0.46 0.06 ... 0.12 0.68 0.75 0.25 0.32 0.86 0.5 0.33 0.44 0.80
1640 9 1 8070 Bridgeporttown 9 0.21 0.47 0.52 0.37 0.14 ... 0.04 0.74 0.36 0.1 0.06 0 0 0.63 0.21 0.79
1643 33 11 50260 Nashuacity 9 0.11 0.40 0.03 0.93 0.12 ... 0.09 0.21 0.06 0.02 0.03 0.69 1 0.52 0.16 0.03
1645 50 7 10675 Burlingtoncity 9 0.05 0.45 0.02 0.96 0.09 ... 0.03 0.31 0.19 0.01 0.01 0.8 0 0.23 0.22 0.07
1716 51 810 82000 VirginiaBeachcity 9 0.62 0.57 0.27 0.71 0.26 ... 0.72 0.13 0.04 0.4 0.12 0.8 1 0.81 0.1 0.12
1751 51 700 56000 NewportNewscity 9 0.26 0.44 0.65 0.43 0.14 ... 0.20 0.21 0.16 0.2 0.04 0.76 1 0.82 0.1 0.35
1777 51 710 57000 Norfolkcity 9 0.40 0.58 0.76 0.34 0.16 ... 0.15 0.41 0.31 0.18 0.08 0.75 0.5 0.79 0.14 0.47
1810 34 23 51210 NewBrunswickcity 10 0.05 0.77 0.58 0.35 0.24 ... 0.01 0.67 0.69 0.03 0.02 0.81 0 0.50 0.23 0.62
1835 34 31 57000 Patersoncity 10 0.21 0.73 0.70 0.10 0.09 ... 0.02 1.00 0.59 0.1 0.04 0.75 0 1.00 0.13 0.65
1839 9 3 22630 EastHartfordtown 10 0.06 0.35 0.16 0.80 0.13 ... 0.05 0.23 0.33 0.02 0.02 0.79 0 0.65 0.2 0.16
1844 34 23 19000 EastBrunswicktownship 10 0.05 0.58 0.04 0.82 0.56 ... 0.06 0.16 0.56 0.04 0.01 0.77 0 0.23 0.14 0.06
1847 34 13 51000 Newarkcity 10 0.43 0.62 1.00 0.00 0.07 ... 0.07 0.97 1.00 0.02 0.03 0.8 0 0.00 0.05 1.00
1859 55 101 66000 Racinecity 10 0.12 0.44 0.36 0.64 0.03 ... 0.04 0.46 0.19 0.03 0.03 0.69 0.5 0.61 0.19 0.34
1878 25 9 34550 Lawrencecity 10 0.10 0.56 0.12 0.47 0.12 ... 0.02 0.84 0.15 0 0.02 0.54 0 0.93 0.14 0.88
1880 34 39 40350 Lindencity 10 0.04 0.39 0.39 0.65 0.09 ... 0.03 0.28 0.32 0.02 0.01 0.85 0 0.99 0.19 0.22
1963 36 27 59641 Poughkeepsiecity 10 0.03 0.32 0.61 0.47 0.09 ... 0.01 0.47 0.42 0.07 0.08 0.49 0 0.37 1 0.45
1981 9 9 35650 Hamdentown 10 0.07 0.38 0.17 0.84 0.11 ... 0.09 0.13 0.17 0.02 0.01 0.72 0 0.62 0.15 0.07
1991 9 9 80070 Waterburytown 10 0.16 0.37 0.25 0.69 0.04 ... 0.08 0.32 0.18 0.08 0.06 0.78 0 0.91 0.28 0.23
1992 25 17 72600 Walthamcity 10 0.08 0.51 0.06 0.87 0.22 ... 0.03 0.38 0.33 0.02 0.02 0.79 0 0.22 0.18 0.19

123 rows × 128 columns

Similarly, we can drop columns that have atleast one NaN in any row by setting the axis argument to 1:


In [6]:
df.dropna(axis=1);

The dropna() method supports additional parameters that can come in handy.


In [7]:
#only drop rows where all columns are null
df.dropna(how='all');

In [8]:
# drop rows that have not at least 4 non-NaN values
df.dropna(thresh=4);

In [9]:
# only drop rows where NaN appear in specific columns (here :'community')
df.dropna(subset=['community']);

Imputing missing values

Often, the removal of samples or dropping of entire feature columns is simply not feasible, because we might lost too much valuable data. In this case, we can use different interpolation techniques to estimate the missing values from the othere training samples in our dataset. One of the most common interpolation technique is mean interpolation, where we simply replace the missing value by the mean value of the entire feature column. A convenient way to achieve this is using the Imputer class from the scikit-learn as shown in the following code.


In [10]:
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(df[features])
imputed_data = imr.transform(df[features]);

Sklearn fundamentals

A convenient way to randomly partition the dataset into a separate test & training dataset is to use the train_test_split function from scikit-learn's cross_validation submodule. As of now, the target variable is ''


In [11]:
#df = df.drop(["communityname", "state", "county", "community"], axis=1)
X, y = imputed_data, df['ViolentCrimesPerPop']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0);

First, we assigned the NumPy array representation of features columns to the variable X, and we assigned the predicted variable to the variable y. Then we used the train_test_split function to randomly split X and y into separate training & test datasets. By setting test_size=0.3 we assigned 30 percent of samples to X_test and the remaining 70 percent to X_train.

Sequential Feature Selection algorithm : Sequential Backward Algorithm(SBS)

Sequential feature selection algorithms are a family of greedy search algorithms that can reduce an initial d-dimensional feature space into a k-dimensional feature subspace where k < d. The idea is to select the most relevant subset of features to improve computational efficieny and reduce generalization error


In [12]:
class SBS():
    def __init__(self, estimator, features, 
                 scoring=r2_score, test_size=0.25,
                random_state=1):
        self.scoring = scoring
        self.estimator = estimator
        self.features = features
        self.test_size = test_size
        self.random_state = random_state
    
    def fit(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, 
                                                           y, 
                                                           test_size = self.test_size,
                                                           random_state = self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        self.scores_ = [score]
        
        while dim > self.features:
            scores = []
            subsets = []
            for p in combinations(self.indices_, r=dim-1):
                score = self._calc_score(X_train, y_train, X_test, y_test, p)
                scores.append(score)
                subsets.append(p)
            best = np.argmax(score)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
        self.k_score_ = self.scores_[-1]
        return self
    
    def transform(self, X):
        return X[:, self.indices_]
    
    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score

In the preceding implementation, we define k_features to be the final number of features when the algorithm should stop. Inside the while loop of the fit method, the feature subsets created by the itertools.combination function are evaluated and reduced until the feature subset has the desired dimensionality. In each iteration, the accuracy score of the best subset is collected in a list self.scores_ based on the internally created test dataset X_test. We will use thoese scores to evaluate the results.

Let's see our implementation using the LinearRegression


In [13]:
clf = LinearRegression() #This can also be replaced by Lasso() or Ridge()
sbs = SBS(clf, features=1)
sbs.fit(X_train, y_train)


Out[13]:
<__main__.SBS instance at 0x7efff18d02d8>

In [28]:
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([-2, 3])
plt.ylabel('Accuracy')
plt.xlabel('Number of Features')
plt.grid()
plt.show()


I have used r2_score as a measurement of accuracy. R^2 (coefficient of determination) regression score function. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.

Also, please note that the accuracy scores are reported on the validation set.

Let us see which 100 features give us the best score of 0.75 on the validation set


In [29]:
k100 = sbs.subsets_[23]

In [30]:
features100 = [features[x] for x in k100]
print features100


['population', 'householdsize', 'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 'numbUrban', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire', 'medFamInc', 'perCapInc', 'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'HispPerCap', 'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu', 'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv', 'PersPerFam', 'PctFam2Par', 'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par', 'PctWorkMomYoungKids', 'PctWorkMom', 'NumIlleg', 'PctIlleg', 'NumImmig', 'PctImmigRecent', 'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10', 'PctRecentImmig', 'PctRecImmig5', 'PctRecImmig8', 'PctRecImmig10', 'PctSpeakEnglOnly', 'PctNotSpeakEnglWell', 'PctLargHouseFam', 'PctLargHouseOccup', 'PersPerOccupHous', 'PersPerOwnOccHous', 'PersPerRentOccHous', 'PctPersOwnOccup', 'PctPersDenseHous', 'PctHousLess3BR', 'MedNumBR', 'HousVacant', 'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos', 'MedYrHousBuilt', 'PctHousNoPhone', 'PctWOFullPlumb', 'OwnOccLowQuart', 'OwnOccMedVal', 'OwnOccHiQuart', 'RentLowQ', 'RentMedian', 'RentHighQ', 'MedRent', 'MedRentPctHousInc', 'MedOwnCostPctInc', 'MedOwnCostPctIncNoMtg', 'NumInShelters', 'NumStreet', 'PctForeignBorn', 'PctBornSameState', 'PctSameHouse85', 'PctSameCity85', 'PctSameState85', 'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps']

In [45]:
plt.figure(figsize=(20, 6))
classifiers = [LinearRegression(), Lasso(), Ridge()]
names = ["Linear Regression", "Lasso", "Ridge"]
for i, classifier, name in zip(range(len(classifiers)), classifiers, names):
    ax = plt.subplot(1, len(classifiers), i+1)
    ax.set_ylim([-1, 1])
    plt.setp(ax.get_yticklabels(), visible=True)
    plt.setp(ax.get_xticklabels(), visible=True)
    sbs = SBS(classifier, features=1)
    sbs.fit(X_train, y_train)
    k_feat = [len(k) for k in sbs.subsets_]
    plt.plot(k_feat, sbs.scores_, marker='o', label="Accuracy")
    plt.ylabel('Accuracy')
    plt.xlabel('Number of Features')
    plt.grid()
    plt.legend(loc="best")
    plt.title("For {} min-score = {:.2e} & max-score={:.2e}".format(name, min(sbs.scores_), max(sbs.scores_)))
plt.show()



In [41]:



Out[41]:
0.15187618504340605

In [ ]: