In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [2]:
# UCI banknote authentication dataset: four wavelet-based image features plus a class label
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt'
data_df = pd.read_csv(url, header=None)
In [3]:
data_df.head()
Out[3]:
In [4]:
data_df.columns = ['variance', 'skewness', 'kurtosis', 'entropy', 'type']
In [5]:
data_df.head()
Out[5]:
In [6]:
# recode the 0/1 class column as human-readable labels
data_df['type'] = data_df['type'].map(lambda x: 'fake' if x == 0 else 'real')
In [7]:
data_df.head()
Out[7]:
In [11]:
data_df.type.value_counts().plot(kind='bar')
Out[11]:
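The bar chart shows the raw counts; `value_counts` can also report the split as fractions, which makes the class balance easier to read at a glance:

data_df['type'].value_counts(normalize=True)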
In [12]:
import seaborn as sns
f_corrs = data_df.drop('type', axis=1).corr()
sns.heatmap(f_corrs, annot=True)
Out[12]:
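The correlation matrix only captures linear relationships between the features. A seaborn pairplot colored by class is a quick exploratory way to eyeball how well each feature pair separates the two types (just a sketch, and it can take a few seconds to render):

sns.pairplot(data_df, hue='type')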
In [14]:
data_df.groupby(by='type').describe().T
Out[14]:
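The transposed describe table is dense; per-class boxplots make the same comparison visually. A minimal sketch using pandas' built-in boxplot:

data_df.boxplot(column=['variance', 'skewness', 'kurtosis', 'entropy'], by='type', figsize=(10, 8))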
In [17]:
import statsmodels.formula.api as sm
model = sm.mnlogit(formula='type ~ kurtosis', data=data_df)
result = model.fit_regularized()
result.summary()
Out[17]:
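The fitted coefficient is on the log-odds scale, so exponentiating it gives an odds ratio: the multiplicative change in the odds of the non-base category per unit of kurtosis. A quick check on the result above:

np.exp(result.params)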
In [19]:
model = sm.mnlogit(formula='type ~ kurtosis + skewness', data=data_df)
result = model.fit_regularized()
result.summary()
Out[19]:
In [20]:
model = sm.mnlogit(formula='type ~ kurtosis + skewness + variance', data=data_df)
result = model.fit_regularized()
result.summary()
Out[20]:
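One way to see what each added feature buys is McFadden's pseudo R-squared, which statsmodels exposes on the fitted result as `prsquared`. A small sketch refitting the three nested formulas (assuming the same `sm` and `data_df` as above):

for formula in ['type ~ kurtosis',
                'type ~ kurtosis + skewness',
                'type ~ kurtosis + skewness + variance']:
    res = sm.mnlogit(formula=formula, data=data_df).fit_regularized(disp=0)
    print(formula, round(res.prsquared, 4))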
In [27]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
# fix the seed so the shuffled folds are reproducible
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
In [51]:
cv_results = []
# MNLogit encodes a string response in sorted order, so column 0 is 'fake' and column 1 is 'real'
labels = ['fake', 'real']
for train_idx, test_idx in fold.split(data_df, data_df['type']):
    model = sm.mnlogit(formula='type ~ kurtosis + skewness + variance', data=data_df.iloc[train_idx])
    result = model.fit_regularized()
    test_df = data_df.iloc[test_idx].drop('type', axis=1)
    # result.predict applies the formula's design transformation to the raw test frame
    predicted = result.predict(test_df)
    predicted_df = pd.DataFrame(np.asarray(predicted), index=test_df.index)
    predicted_result = predicted_df.apply(lambda v: labels[np.argmax(v)], axis=1)
    cv_result = pd.concat([predicted_result, data_df['type'].iloc[test_idx]], axis=1, keys=['predicted', 'actual'])
    cv_results.append(cv_result)
In [52]:
from sklearn.metrics import confusion_matrix
for r in cv_results:
    plt.figure()
    c = confusion_matrix(r['actual'], r['predicted'], labels=labels)
    sns.heatmap(c, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
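Per-fold heatmaps are noisy to compare by eye; stacking the folds gives one overall confusion matrix. A sketch, assuming `cv_results` from the loop above:

all_folds = pd.concat(cv_results)
c = confusion_matrix(all_folds['actual'], all_folds['predicted'], labels=labels)
sns.heatmap(c, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)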
In [53]:
f1_scores = [f1_score(r['actual'], r['predicted'], average='macro') for r in cv_results]
In [54]:
pd.Series(f1_scores).plot(kind='bar')
Out[54]:
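The per-fold bars can be summarized with a mean and spread:

pd.Series(f1_scores).agg(['mean', 'std'])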
In [57]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
In [60]:
estimator = GaussianNB()
features_df = data_df.drop('type', axis=1)
labels = data_df['type']
# with an integer cv and a classifier, cross_val_score uses stratified folds by default
scores = cross_val_score(estimator, features_df, labels, cv=5, scoring='f1_macro')
In [61]:
pd.Series(scores).plot(kind='bar')
Out[61]:
In [64]:
estimator = GaussianNB()
# same model, but without the 'entropy' feature
features_df = data_df.drop(['type', 'entropy'], axis=1)
labels = data_df['type']
scores = cross_val_score(estimator, features_df, labels, cv=5, scoring='f1_macro')
In [65]:
pd.Series(scores).plot(kind='bar')
Out[65]:
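To compare the two feature sets head to head, the same cross-validation can be run in one loop (a summary sketch of the two cells above):

feature_sets = {
    'all four features': data_df.drop('type', axis=1),
    'without entropy': data_df.drop(['type', 'entropy'], axis=1),
}
for name, X in feature_sets.items():
    s = cross_val_score(GaussianNB(), X, data_df['type'], cv=5, scoring='f1_macro')
    print('%s: %.4f +/- %.4f' % (name, s.mean(), s.std()))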