In [1]:
import numpy as np
import pandas as pd
%pylab inline
pylab.style.use('ggplot')
In [2]:
# Raw E. coli data file hosted in the UCI machine-learning repository.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data'

# The file has no header row and its fields are whitespace-delimited.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True
# and parses the file the same way.
data_df = pd.read_csv(url, header=None, sep=r'\s+')
Attribute Information
In [3]:
# Column labels for the data frame, in file order: a sequence identifier,
# seven attribute columns, and the class label.
columns = [
    'seq_name',
    'mcg', 'gvh', 'lip', 'chg', 'aac', 'alm1', 'alm2',
    'type',
]
In [4]:
# Preview the first rows; the column labels are still the default integers
# because the file was read with header=None.
data_df.head()
Out[4]:
In [5]:
# Replace the default integer column labels (the file was read with
# header=None) with the attribute names listed in `columns`.
data_df.columns = columns
In [6]:
# Preview again to confirm the column names were applied.
data_df.head()
Out[6]:
In [7]:
# Bar chart of how many rows fall into each value of 'type'.
data_df['type'].value_counts().plot.bar()
Out[7]:
In [8]:
from sklearn.feature_selection import f_classif, chi2

# Score every attribute column against the class label with f_classif;
# the identifier and the label itself are excluded from the predictors.
predictors = data_df.drop(['seq_name', 'type'], axis=1)
f_test_statistic, f_p_val = f_classif(predictors, data_df['type'])
In [9]:
# Label each F statistic with its attribute name (every column except the
# identifier and the class label, in frame order).
f_test_series = pd.Series(
    f_test_statistic,
    index=data_df.columns.drop(['seq_name', 'type']),
)
In [10]:
# Bar chart of the F statistics, smallest first (ascending is the default order).
f_test_series.sort_values().plot.bar()
Out[10]:
In [11]:
# Label each p-value with its attribute name, then list them from the
# smallest p-value upwards.
attribute_names = data_df.columns.drop(['seq_name', 'type'])
f_test_p_vals = pd.Series(f_p_val, index=attribute_names)
f_test_p_vals.sort_values()
Out[11]:
In [12]:
# Mean 'aac' value for each value of 'type'.
# The earlier form took describe() and sliced it with `.loc[:, 'mean', :]`;
# where describe() returns a DataFrame that three-part indexer raises
# "Too many indexers". Asking the groupby for the mean directly does not
# depend on the shape describe() returns.
avg_aac = data_df.groupby(by='type')['aac'].mean()

# NOTE(review): the bars are one per 'type' value, not per feature, despite
# the wording of the title.
avg_aac.plot(kind='barh', title='Average AAC values per feature')
Out[12]:
In [13]:
# Mean 'alm1' value for each value of 'type'.
# The earlier form took describe() and sliced it with `.loc[:, 'mean', :]`;
# where describe() returns a DataFrame that three-part indexer raises
# "Too many indexers". Asking the groupby for the mean directly does not
# depend on the shape describe() returns.
avg_alm1 = data_df.groupby(by='type')['alm1'].mean()

# NOTE(review): the bars are one per 'type' value, not per feature, despite
# the wording of the title.
avg_alm1.plot(kind='barh', title='Average ALM1 values per feature')
Out[13]:
In [21]:
# pd.crosstab(data_df['seq_name'], data_df['type'])
In [15]:
import seaborn as sns

# Pairwise correlations between the attribute columns (identifier and class
# label removed), drawn as an annotated heatmap.
attribute_df = data_df.drop(['type', 'seq_name'], axis=1)
corrs = attribute_df.corr()
sns.heatmap(corrs, annot=True)
Out[15]:
In [16]:
# Count the rows in each value of 'type'. The largest count is the size that
# every class is resampled to by the oversampling cell that follows.
feature_counts = data_df['type'].value_counts()
max_counts = feature_counts.max()
max_counts
Out[16]:
In [17]:
# Oversample every class, with replacement, up to the size of the largest
# class so that all classes end up with `max_counts` rows.
unique_labels = feature_counts.index
indexed_by_labels = data_df.set_index('type')

# One shared, seeded generator makes the resampling reproducible across
# re-runs while still drawing different rows for each class.
oversample_rng = np.random.RandomState(42)

# .loc[[label]] (list indexer) always yields a DataFrame. A scalar .loc[label]
# collapses to a Series when a class has a single row, and .sample would then
# draw from that row's column values instead of from rows.
sampled = [
    indexed_by_labels.loc[[label]].sample(
        max_counts, replace=True, random_state=oversample_rng
    )
    for label in unique_labels
]

# reset_index() turns the 'type' index back into an ordinary column.
oversampled_data_df = pd.concat(sampled, axis=0).reset_index()
In [18]:
# Preview the oversampled frame; 'type' is the first column because it was
# restored from the index by reset_index().
oversampled_data_df.head()
Out[18]:
In [19]:
# Horizontal bar chart of rows per 'type' value after oversampling.
oversampled_data_df['type'].value_counts().plot.barh()
Out[19]:
In [20]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the attribute columns of the oversampled frame;
# the identifier and the class label are excluded from the features.
estimator = GaussianNB()
features = oversampled_data_df.drop(['type', 'seq_name'], axis=1)
labels = oversampled_data_df['type']

# A fixed random_state makes the shuffled fold assignment, and therefore the
# scores, reproducible across re-runs.
# NOTE(review): the frame was oversampled with replacement before splitting,
# so copies of the same row can fall into both the training and the test
# fold; these scores are likely optimistic. Resampling inside each training
# fold would avoid that.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(estimator, features, labels, cv=fold)

# One bar per fold.
pd.Series(scores).plot(kind='bar')
Out[20]: