In [ ]:
%matplotlib inline

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

from scipy import stats, optimize
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression, Ridge

from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import explained_variance_score, r2_score, median_absolute_error

print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The pandas version is {}.'.format(pd.__version__))
print('The numpy version is {}.'.format(np.__version__))

Read the CSV

We use pandas' read_csv() method to read the CSV file. The missing values in this dataset are marked with '?', so we replace them with np.nan (Not a Number). This way we can count the number of missing values per column.


In [ ]:
df = pd.read_csv('../datasets/UCIrvineCrimeData.csv')
df = df.replace('?', np.nan)
features = [x for x in df.columns
            if x not in ['state', 'community', 'communityname', 'county', 'ViolentCrimesPerPop']]

Find the number of missing values in every column


In [ ]:
df.isnull().sum()
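
Sorting these counts in descending order makes it easier to see which columns are most affected; the head(10) cutoff below is just an arbitrary choice for display:


In [ ]:
df.isnull().sum().sort_values(ascending=False).head(10)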

Eliminating samples or features with missing values

One of the easiest ways to deal with missing values is to simply remove the corresponding features (columns) or samples (rows) from the dataset entirely. Rows with missing values can easily be dropped via the dropna method.


In [ ]:
df.dropna()

Similarly, we can drop columns that have at least one NaN in any row by setting the axis argument to 1:


In [ ]:
df.dropna(axis=1);

The dropna() method supports additional parameters that can come in handy.


In [ ]:
# only drop rows where all columns are NaN
df.dropna(how='all');

In [ ]:
# drop rows that have fewer than 4 non-NaN values
df.dropna(thresh=4);

In [ ]:
# only drop rows where NaN appears in a specific column (here: 'community')
df.dropna(subset=['community']);
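
Note that each of the dropna calls above returns a new DataFrame and leaves df itself untouched (inplace defaults to False), which we can confirm by comparing shapes:


In [ ]:
print(df.shape)
print(df.dropna().shape)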

Imputing missing values

Often, the removal of samples or dropping of entire feature columns is simply not feasible, because we might lose too much valuable data. In this case, we can use different interpolation techniques to estimate the missing values from the other training samples in our dataset. One of the most common techniques is mean imputation, where we simply replace the missing value with the mean value of the entire feature column. A convenient way to achieve this is to use the SimpleImputer class from scikit-learn, as shown in the following code.


In [ ]:
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df[features])
imputed_data = imr.transform(df[features])
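
As a quick sanity check, we can verify that the imputed array no longer contains any missing values:


In [ ]:
print(np.isnan(imputed_data).sum())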

Sklearn fundamentals

A convenient way to randomly partition the dataset into separate training and test datasets is to use the train_test_split function from scikit-learn's model_selection submodule.


In [ ]:
#df = df.drop(["communityname", "state", "county", "community"], axis=1)
X, y = imputed_data, df['ViolentCrimesPerPop'].astype(float)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

First, we assigned the NumPy array representation of the feature columns to the variable X, and we assigned the target variable to the variable y. Then we used the train_test_split function to randomly split X and y into separate training and test datasets. By setting test_size=0.3, we assigned 30 percent of the samples to X_test and the remaining 70 percent to X_train.
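
We can confirm the split proportions by inspecting the shapes of the resulting arrays:


In [ ]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)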

Sequential Feature Selection algorithms: Sequential Backward Selection (SBS)

Sequential feature selection algorithms are a family of greedy search algorithms that can reduce an initial d-dimensional feature space to a k-dimensional feature subspace, where k < d. The idea is to select the most relevant subset of features in order to improve computational efficiency and reduce the generalization error.


In [ ]:
class SBS():
    def __init__(self, estimator, features,
                 scoring=r2_score, test_size=0.25,
                 random_state=1):
        self.scoring = scoring
        self.estimator = estimator
        self.features = features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        # internal validation split used to score candidate feature subsets
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=self.test_size,
                                                            random_state=self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        self.scores_ = [score]

        while dim > self.features:
            scores = []
            subsets = []
            # evaluate every subset obtained by removing exactly one feature
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train, X_test, y_test, p)
                scores.append(score)
                subsets.append(p)
            # keep the subset with the highest validation score
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
            print(self.scores_)
        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score

In [ ]:
clf = LinearRegression()
sbs = SBS(clf, features=1)
sbs.fit(X_train, y_train)
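
Once the search has finished, we can map the indices of the best-scoring subset back to the original column names. This works because the columns of imputed_data are in the same order as the features list (the imputer was fit on df[features]):


In [ ]:
best = np.argmax(sbs.scores_)
best_subset = sbs.subsets_[best]
print([features[i] for i in best_subset])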

In [ ]:
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([-1, 1])
plt.ylabel('$R^2$ score')
plt.xlabel('Number of Features')
plt.grid()
plt.show()
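
Finally, as a rough check of how well the selected subset generalizes, we can refit the linear model on the training data using only that subset and score it on the held-out test set:


In [ ]:
best = np.argmax(sbs.scores_)
best_subset = list(sbs.subsets_[best])
clf.fit(X_train[:, best_subset], y_train)
y_pred = clf.predict(X_test[:, best_subset])
print('R^2 on the test set: {:.3f}'.format(r2_score(y_test, y_pred)))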
