In [ ]:
%matplotlib inline
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from scipy import stats, optimize
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import explained_variance_score, r2_score, median_absolute_error
print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The pandas version is {}.'.format(pd.__version__))
print('The numpy version is {}.'.format(np.__version__))
In [ ]:
df = pd.read_csv('../datasets/UCIrvineCrimeData.csv')
df = df.replace('?', np.nan)
features = [x for x in df.columns
            if x not in ['state', 'community', 'communityname', 'county', 'ViolentCrimesPerPop']]
In [ ]:
df.isnull().sum()
In [ ]:
df.dropna()
Similarly, we can drop columns that have at least one NaN in any row by setting the axis argument to 1:
In [ ]:
df.dropna(axis=1);
The dropna() method supports additional parameters that can come in handy.
In [ ]:
# only drop rows where all columns are NaN
df.dropna(how='all');
In [ ]:
# drop rows that have fewer than 4 non-NaN values
df.dropna(thresh=4);
In [ ]:
# only drop rows where NaN appears in specific columns (here: 'community')
df.dropna(subset=['community']);
Often, the removal of samples or the dropping of entire feature columns is simply not feasible, because we might lose too much valuable data. In this case, we can use different interpolation techniques to estimate the missing values from the other training samples in our dataset. One of the most common techniques is mean imputation, where we simply replace a missing value with the mean value of the entire feature column. A convenient way to achieve this is to use the Imputer class from scikit-learn, as shown in the following code.
In [ ]:
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(df[features])
imputed_data = imr.transform(df[features]);
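As an aside, the same idea can be sketched directly in pandas. This is only an illustrative alternative (it assumes the feature columns can be coerced to numeric dtypes; df_numeric and imputed_df are just example names):
In [ ]:
# Illustrative alternative: column-wise mean imputation with pandas
# (assumes the feature columns can be coerced to numeric dtypes)
df_numeric = df[features].apply(pd.to_numeric, errors='coerce')
imputed_df = df_numeric.fillna(df_numeric.mean())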
A convenient way to randomly partition the dataset into separate training and test datasets is to use the train_test_split function from scikit-learn's cross_validation submodule.
In [ ]:
#df = df.drop(["communityname", "state", "county", "community"], axis=1)
X, y = imputed_data, df['ViolentCrimesPerPop']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0);
First, we assigned the NumPy array representation of the feature columns to the variable X, and we assigned the target variable to the variable y. Then we used the train_test_split function to randomly split X and y into separate training and test datasets. By setting test_size=0.3, we assigned 30 percent of the samples to X_test and the remaining 70 percent to X_train.
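As a quick sanity check (not part of the original analysis), we can print the sizes of the resulting partitions to confirm the 70/30 split:
In [ ]:
# Sanity check: confirm the sizes of the training and test partitions
print('Training samples: {}'.format(X_train.shape[0]))
print('Test samples: {}'.format(X_test.shape[0]))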
Sequential feature selection algorithms are a family of greedy search algorithms that reduce an initial d-dimensional feature space to a k-dimensional feature subspace where k < d. The idea is to select the most relevant subset of features in order to improve computational efficiency and reduce the generalization error.
In [ ]:
class SBS():
    def __init__(self, estimator, features, scoring=r2_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = estimator
        self.features = features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        self.scores_ = [score]
        # greedily drop one feature at a time until only self.features remain
        while dim > self.features:
            scores = []
            subsets = []
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train, X_test, y_test, p)
                scores.append(score)
                subsets.append(p)
            # keep the candidate subset with the best validation score
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
            print(self.scores_)
        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score
In [ ]:
clf = LinearRegression()
sbs = SBS(clf, features=1)
sbs.fit(X_train, y_train)
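Because SBS stores column indices rather than names, one illustrative way to inspect the smallest surviving subsets is to map the stored indices back to the features list (this sketch assumes imputed_data preserves the column order of features):
In [ ]:
# Map the selected column indices back to feature names (illustrative sketch;
# assumes imputed_data preserves the column order of `features`)
for subset in sbs.subsets_[-3:]:
    print([features[i] for i in subset])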
In [ ]:
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([-1, 1])
plt.ylabel('$R^2$ score')
plt.xlabel('Number of Features')
plt.grid()
plt.show()
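As a follow-up sketch (assuming we simply pick the subset with the highest validation score recorded by SBS), we can refit the regression on those columns and score it on the held-out test set:
In [ ]:
# Refit on the best-scoring feature subset and evaluate on the held-out test set
# (illustrative sketch; assumes sbs.scores_ and sbs.subsets_ align index-wise)
best = np.argmax(sbs.scores_)
best_subset = list(sbs.subsets_[best])
clf.fit(X_train[:, best_subset], y_train)
print('Test set R^2: {:.3f}'.format(r2_score(y_test, clf.predict(X_test[:, best_subset]))))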