In [4]:
import pandas as pd
from io import StringIO
from sklearn.preprocessing import Imputer
In [8]:
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''
In [9]:
df = pd.read_csv(StringIO(csv_data))
Using mean imputation: replace each missing value with the mean of the entire feature column.
In [10]:
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
# 'median' or 'most_frequent' can also be used as the strategy
In [11]:
imr = imr.fit(df)
In [12]:
imputed_data = imr.transform(df.values)
In [13]:
imputed_data
Out[13]:
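Newer scikit-learn releases (0.20+) replace Imputer with SimpleImputer from sklearn.impute; a minimal equivalent sketch, assuming one of those newer versions is installed:
import numpy as np
from sklearn.impute import SimpleImputer
# SimpleImputer always imputes column-wise, so there is no axis argument
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_data = imr.fit_transform(df.values)
imputed_data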
In [14]:
df = pd.DataFrame([ ['green', 'M', 10.1, 'class1'], ['red', 'L', 13.5, 'class2'],
['blue', 'XL', 15.3, 'class1']])
df.columns = ['color', 'size', 'price', 'classlabel']
In [15]:
# color is a nominal feature, size is ordinal, price is numerical
df
Out[15]:
In [16]:
size_mapping = { 'XL': 3, 'L':2, 'M':1}
In [17]:
df['size'] = df['size'].map(size_mapping)
# this can be undone using the reverse mapping dict
# inv_size_mapping = {v: k for k, v in size_mapping.items()} -- see the sketch after this cell
In [18]:
df
Out[18]:
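To undo the ordinal encoding, the reverse mapping mentioned in the comment above can be applied with map in the same way; a short sketch:
inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'].map(inv_size_mapping)  # restores the original 'M', 'L', 'XL' strings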
In [19]:
import numpy as np
In [20]:
# class labels are not ordinal, so it doesn't matter which integer we assign to each string label
class_mapping = {label:idx for idx,label in enumerate(np.unique(df['classlabel']))}
In [21]:
class_mapping
Out[21]:
In [22]:
df['classlabel'] = df['classlabel'].map(class_mapping)
In [23]:
df
Out[23]:
In [24]:
# alternatively, scikit-learn's LabelEncoder can be used
from sklearn.preprocessing import LabelEncoder
In [25]:
class_le = LabelEncoder()
In [26]:
class_le.fit_transform(df['classlabel'].values)
Out[26]:
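LabelEncoder also provides inverse_transform to undo the integer encoding; a quick sketch:
y = class_le.fit_transform(df['classlabel'].values)
class_le.inverse_transform(y)  # recovers the labels that were passed to fit_transform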
In [27]:
from sklearn.preprocessing import OneHotEncoder
In [29]:
X = df[['color','size','price']].values
In [31]:
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
In [32]:
X
Out[32]:
In [33]:
ohe = OneHotEncoder(categorical_features=[0])
In [34]:
ohe.fit_transform(X).toarray()
Out[34]:
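OneHotEncoder returns a sparse matrix by default, which is why toarray() is needed above; the sparse output can also be disabled directly via the sparse keyword, as in this small alternative sketch (assuming the same scikit-learn version used above):
ohe = OneHotEncoder(categorical_features=[0], sparse=False)
ohe.fit_transform(X)  # returns a dense NumPy array, no toarray() required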
In [35]:
#even easier using pandas:
pd.get_dummies(df[['price','color','size']])
Out[35]:
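One of the color dummy columns above is redundant (it is fully determined by the others); newer pandas versions (0.18+) accept drop_first=True to remove it, a hedged sketch:
# drop the first level of each encoded categorical column to avoid perfectly collinear dummies
pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)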
In [36]:
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
In [37]:
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',
'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
In [38]:
print('Class labels', np.unique(df_wine['Class label']))
In [39]:
df_wine.head()
Out[39]:
In [40]:
# note: in scikit-learn 0.18+ this import moved to sklearn.model_selection
from sklearn.cross_validation import train_test_split
In [41]:
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
In [43]:
from sklearn.preprocessing import MinMaxScaler
In [44]:
mms = MinMaxScaler()
In [45]:
X_train_norm = mms.fit_transform(X_train)
In [46]:
X_test_norm = mms.transform(X_test)
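MinMaxScaler rescales each feature column to the [0, 1] range using the column minimum and maximum learned from the training data; as a quick sanity check, the same normalization written out by hand:
# manual min-max scaling, column by column (should match X_train_norm)
(X_train - X_train.min(axis=0)) / (X_train.max(axis=0) - X_train.min(axis=0))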
In [47]:
from sklearn.preprocessing import StandardScaler
In [48]:
stdsc = StandardScaler()
In [49]:
X_train_std = stdsc.fit_transform(X_train)
In [50]:
X_test_std = stdsc.transform(X_test)
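StandardScaler centers each feature at zero mean and unit variance using the training set's statistics; again a small sanity-check sketch of the same computation:
# manual standardization (NumPy's default ddof=0 matches StandardScaler)
(X_train - X_train.mean(axis=0)) / X_train.std(axis=0)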
In [51]:
from sklearn.linear_model import LogisticRegression
In [55]:
lr = LogisticRegression(penalty='l1', C=0.1)
In [56]:
lr.fit(X_train_std, y_train)
Out[56]:
In [57]:
print('Training accuracy:', lr.score(X_train_std, y_train))
In [58]:
print('Test accuracy:', lr.score(X_test_std, y_test))
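The L1 penalty drives many weights to exactly zero, which is what makes it usable for feature selection; the sparse weight matrix of the fitted model can be inspected directly:
print(lr.intercept_)  # one intercept per class (one-vs-rest)
print(lr.coef_)       # one weight row per class; many entries are exactly 0 due to the L1 penalty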
In [61]:
# plot the regularization path: weight coefficients as a function of the inverse regularization strength C
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.subplot(111)
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black',
          'pink', 'lightgreen', 'lightblue', 'gray', 'indigo', 'orange']
weights, params = [], []
for c in np.arange(-4, 6, dtype=float):
    lr = LogisticRegression(penalty='l1', C=10**c, random_state=0)
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])   # weights for the second class in the one-vs-rest scheme
    params.append(10**c)
weights = np.array(weights)
for column, color in zip(range(weights.shape[1]), colors):
    plt.plot(params, weights[:, column], label=df_wine.columns[column+1], color=color)
plt.axhline(0, color='black', linestyle='--', linewidth=3)
plt.xlim([10**(-5), 10**5])
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.xscale('log')
plt.legend(loc='upper left')
ax.legend(loc='upper center', bbox_to_anchor=(1.38, 1.03), ncol=1, fancybox=True)
plt.show()
In [62]:
# Sequential Backward Selection (SBS) is not implemented in scikit-learn (yet?), so implement it ourselves
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import accuracy_score
In [63]:
class SBS():
    def __init__(self, estimator, k_features, scoring=accuracy_score, test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        # the split here creates an internal training set and a test (validation) set
        # used only for scoring the candidate feature subsets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size,
                                                            random_state=self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        self.scores_ = [score]
        while dim > self.k_features:
            scores = []
            subsets = []
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train, X_test, y_test, p)
                scores.append(score)
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score
In [64]:
from sklearn.neighbors import KNeighborsClassifier
In [65]:
knn = KNeighborsClassifier(n_neighbors=2)
In [66]:
sbs = SBS(knn, k_features=1)
In [67]:
sbs.fit(X_train_std, y_train)
Out[67]:
In [68]:
k_feat = [len(k) for k in sbs.subsets_]
In [69]:
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.show()
In [70]:
# subsets_[8] is the 5-feature subset (we started with 13 features and removed one per step)
k5 = list(sbs.subsets_[8])
In [71]:
print(df_wine.columns[1:][k5])
In [72]:
knn.fit(X_train_std, y_train)
Out[72]:
In [73]:
print('Training accuracy:', knn.score(X_train_std, y_train))
In [74]:
print('Test accuracy:', knn.score(X_test_std, y_test))
In [75]:
# Slight degree of overfitting above: training accuracy is higher than test accuracy.
# Now use the selected 5-feature subset and see how well KNN performs:
In [76]:
knn.fit(X_train_std[:, k5], y_train)
Out[76]:
In [77]:
print('Training accuracy:', knn.score(X_train_std[:, k5], y_train))
In [78]:
print('Test accuracy: ', knn.score(X_test_std[:, k5], y_test))
In [79]:
from sklearn.ensemble import RandomForestClassifier
In [80]:
feat_labels = df_wine.columns[1:]
In [81]:
forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
In [82]:
forest.fit(X_train, y_train)
Out[82]:
In [83]:
importances = forest.feature_importances_
In [84]:
indices = np.argsort(importances)[::-1]
In [85]:
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
In [87]:
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]), importances[indices], color='lightblue', align='center')
plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()
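The importance ranking can also drive automatic feature selection; a sketch using scikit-learn's SelectFromModel (available since 0.17) with an arbitrary example threshold of 0.1, which is an assumption rather than a value from this analysis:
from sklearn.feature_selection import SelectFromModel
# keep only features whose importance exceeds the (arbitrarily chosen) threshold
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
X_selected.shape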
In [88]:
# Keep in mind with this technique: if two or more features are highly correlated, one feature may be
# ranked very highly while the information of the other correlated feature(s) is not fully captured.
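To check whether that caveat applies here, the pairwise feature correlations can be inspected; a short sketch:
# pairwise Pearson correlations between the 13 features; large absolute values flag redundant features
corr = df_wine.iloc[:, 1:].corr()
print(corr.round(2))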