In [1]:
%matplotlib inline
import itertools as it
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
We will try to separate rocks from mines using this data set.
From the description provided:
Data Set Information:
The file "sonar.mines" contains 111 patterns obtained by bouncing sonar signals off a metal cylinder at various angles and under various conditions. The file "sonar.rocks" contains 97 patterns obtained from rocks under similar conditions. The transmitted sonar signal is a frequency-modulated chirp, rising in frequency. The data set contains signals obtained from a variety of different aspect angles, spanning 90 degrees for the cylinder and 180 degrees for the rock.
Each pattern is a set of 60 numbers in the range 0.0 to 1.0. Each number represents the energy within a particular frequency band, integrated over a certain period of time. The integration aperture for higher frequencies occurs later in time, since these frequencies are transmitted later during the chirp.
The label associated with each record contains the letter "R" if the object is a rock and "M" if it is a mine (metal cylinder). The numbers in the labels are in increasing order of aspect angle, but they do not encode the angle directly.
In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data',
                 header=None, names=[f'X{i}' for i in range(61)])  # name the 61 columns X0..X60 (read_csv's prefix argument is gone in recent pandas)
In [3]:
df.shape
Out[3]:
(208, 61)
In [4]:
df.rename(columns={'X60':'Label'}, inplace=True)
df.Label = df.Label.astype('category')
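As a quick sanity check, the class counts should match the 111 mine and 97 rock patterns from the description:

df.Label.value_counts()  # expect M: 111, R: 97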
In [5]:
df.head()
Out[5]:
In [6]:
from pandas.plotting import andrews_curves, parallel_coordinates
In [7]:
fig, axes = plt.subplots(2,1,figsize=(12,8))
andrews_curves(df, 'Label', samples=50, linewidth=0.5, ax=axes[0])
axes[0].set_xticks([])
parallel_coordinates(df, 'Label', linewidth=0.5, ax=axes[1],
                     axvlines_kwds={'linewidth': 0.5, 'color': 'black', 'alpha': 0.5})
axes[1].set_xticks([])
axes[1].margins(0.05)
pass
In [8]:
from sklearn.manifold import MDS
In [9]:
mds = MDS(n_components=2)
In [10]:
mds_data = mds.fit_transform(df.iloc[:, :-1])
In [11]:
plt.scatter(mds_data[:, 0], mds_data[:, 1], c=df.Label.cat.codes, s=50);
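MDS is stochastic, so the embedding will differ between runs; passing random_state makes it reproducible, and the fitted stress_ attribute gives a rough sense of how faithfully the 2-D layout preserves the original pairwise distances. A minimal sketch:

mds = MDS(n_components=2, random_state=42)
mds_data = mds.fit_transform(df.iloc[:, :-1])
mds.stress_  # raw stress; lower means distances are better preserved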
In [12]:
heatmap = plt.pcolor(df.corr(numeric_only=True), cmap='jet')
plt.colorbar(heatmap)
plt.gca().set_aspect('equal')
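Adjacent frequency bands tend to be strongly correlated in spectral data like this. To list the most correlated pairs explicitly, one sketch is to keep the upper triangle of the correlation matrix and sort:

corr = df.corr(numeric_only=True)
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)  # upper triangle, diagonal excluded
corr.where(mask).stack().sort_values(ascending=False).head(10)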
In [13]:
df.plot.box(figsize=(12,4), xticks=[])
pass
In [14]:
df.plot.density(figsize=(6, 60), subplots=True, yticks=[])
pass
In [15]:
from sklearn.preprocessing import StandardScaler, RobustScaler
In [16]:
data, labels = df.iloc[:, :-1], df.iloc[:, -1]
In [17]:
data.head(3)
Out[17]:
In [18]:
data_scaled = DataFrame(StandardScaler().fit_transform(data), columns=data.columns)
In [19]:
data_scaled.head(3)
Out[19]:
In [20]:
data_robust = DataFrame(RobustScaler().fit_transform(data), columns=data.columns)
In [21]:
data_robust.head(3)
Out[21]:
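A quick check that the scalers behave as expected: StandardScaler should leave each column with mean ≈ 0 and standard deviation ≈ 1, while RobustScaler centres each column on its median and scales by the interquartile range:

print(data_scaled.mean().abs().max(), data_scaled.std().max())
print(data_robust.median().abs().max())  # medians should be ~0 after robust scaling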
In [22]:
from sklearn.decomposition import PCA
In [23]:
data.shape
Out[23]:
(208, 60)
In [24]:
pca = PCA()
data_scaled_pca = DataFrame(pca.fit_transform(data_scaled), columns=data.columns)
In [25]:
data_scaled.shape
Out[25]:
(208, 60)
In [26]:
v = pca.explained_variance_ratio_
vc = v.cumsum()
DataFrame(list(zip(it.count(), v, vc)), columns=['pc', 'explained', 'cumsum']).head(10)
Out[26]:
In [27]:
n_comps = 1 + np.argmax(vc > 0.95)
n_comps
Out[27]:
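np.argmax on a boolean array returns the position of the first True, so 1 + np.argmax(vc > 0.95) is the smallest number of components whose cumulative explained variance exceeds 95%. A quick assertion to confirm the arithmetic:

assert vc[n_comps - 1] > 0.95
assert n_comps == 1 or vc[n_comps - 2] <= 0.95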
In [28]:
data_scaled_pca = data_scaled_pca.iloc[:, :n_comps]
data_scaled_pca.shape
Out[28]:
In [29]:
data_scaled_pca.head()
Out[29]:
In [30]:
df_pca = pd.concat([data_scaled_pca, labels], axis=1)
df_pca.shape
Out[30]:
In [31]:
from sklearn.model_selection import train_test_split
In [32]:
X_train, X_test, y_train, y_test = \
    train_test_split(data_scaled_pca, labels, test_size=0.33, random_state=42)
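With only 208 rows, a random split can leave the two halves with noticeably different class balances, which is worth checking; train_test_split's stratify parameter can enforce the original 111:97 proportions if needed:

y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)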
In [33]:
from sklearn.linear_model import LogisticRegression
In [34]:
lr = LogisticRegression()
In [35]:
lr.fit(X_train, y_train)
Out[35]:
In [36]:
lr.score(X_test, y_test)
Out[36]:
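A single train/test split is noisy on a data set this small; cross-validation gives a steadier estimate of the same model. A minimal sketch:

from sklearn.model_selection import cross_val_score
scores = cross_val_score(LogisticRegression(), data_scaled_pca, labels, cv=5)
scores.mean(), scores.std()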
In [37]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
In [38]:
parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
               'C': [1, 10, 100, 1000]},
              {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
In [39]:
clf = GridSearchCV(SVC(C=1), parameters, cv=5, scoring='accuracy', n_jobs=-1)
clf.fit(X_train, y_train.cat.codes)
pass
In [40]:
clf.best_params_
Out[40]:
In [41]:
clf.best_score_
Out[41]:
In [42]:
clf.score(X_test, y_test.cat.codes)
Out[42]:
In [43]:
from sklearn.metrics import classification_report
In [44]:
y_true, y_pred = y_test.cat.codes, clf.predict(X_test)
print(classification_report(y_true, y_pred))
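The report gives per-class precision and recall; a confusion matrix shows the raw counts behind those numbers:

from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred)  # rows are true classes, columns are predictions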
In [45]:
from sklearn.ensemble import RandomForestClassifier
In [46]:
X_train, X_test, y_train, y_test = \
    train_test_split(data_scaled, labels, test_size=0.33, random_state=42)
In [47]:
parameters = [{'n_estimators': list(range(25, 201, 25)),
               'max_features': list(range(2, 15, 2))}]
clf = GridSearchCV(RandomForestClassifier(), parameters,
                   cv=5, scoring='accuracy', n_jobs=-1)
clf.fit(X_train, y_train.cat.codes)
pass
In [48]:
clf.best_params_
Out[48]:
In [49]:
clf.score(X_test, y_test.cat.codes)
Out[49]:
In [50]:
imp = clf.best_estimator_.feature_importances_
idx = np.argsort(imp)
In [51]:
plt.figure(figsize=(6, 18))
plt.barh(range(len(imp)), imp[idx])
plt.yticks(np.arange(len(imp)), idx)  # bars are centred on integer positions in modern matplotlib
pass
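The tick labels above are bare column positions; mapping them back to the Xn column names makes the chart easier to read. A sketch restricted to the ten most important features:

top = idx[-10:]  # argsort is ascending, so the last entries are the most important
plt.barh(range(len(top)), imp[top])
plt.yticks(range(len(top)), data_scaled.columns[top])
pass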
In [52]:
from sklearn.pipeline import Pipeline
In [53]:
# split the raw features here; the pipeline's own scaler standardizes within each CV fold
X_train, X_test, y_train, y_test = \
    train_test_split(data, labels, test_size=0.33, random_state=42)
In [54]:
scaler = StandardScaler()
pca = PCA()
clf = LogisticRegression()
pipe = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('clf', clf)])
n_components = [20, 30, 40, 50, 60]
Cs = np.logspace(-4, 4, 3)  # a small log-spaced grid of C values to search over
estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              clf__C=Cs), n_jobs=-1)
estimator.fit(X_train, y_train.cat.codes)
pass
In [55]:
estimator.best_estimator_.named_steps['pca'].n_components
Out[55]:
In [56]:
estimator.score(X_test, y_test.cat.codes)
Out[56]:
In [57]:
y_true, y_pred = y_test.cat.codes, estimator.predict(X_test)
print(classification_report(y_true, y_pred))