Dataset description: http://www.stat.ufl.edu/~winner/data/gold_target1.txt
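Columns (variable, followed by what appear to be its character positions in the data file):
- As (arsenic) level: 1-8
- Sb (antimony) level: 10-16
- Lineament proximity: 24 /* 1 = present, 0 = absent (within 0.5 km) */
- Gold deposit proximity: 32 /* 1 = present, 0 = absent (within 0.5 km) */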
In [1]:
import numpy as np
import pandas as pd
# IPython line magic. Later cells call `pylab.style.use(...)` and `pylab.figure()`,
# so they depend on the `pylab` name that this magic is presumed to inject into
# the namespace -- NOTE(review): confirm on a fresh kernel before replacing it.
%pylab inline
# Apply the 'ggplot' style sheet to the figures drawn in the cells below.
pylab.style.use('ggplot')
In [56]:
# Whitespace-delimited data file without a header row; the columns are named
# in a later cell.
url = 'http://www.stat.ufl.edu/~winner/data/gold_target1.dat'
# Raw string: in a normal string literal `\s` is an invalid escape sequence
# (DeprecationWarning / SyntaxWarning on modern Python). `\s+` matches exactly
# what the original `[\s]+` character class matched.
data_df = pd.read_csv(url, sep=r'\s+', engine='python', header=None)
In [57]:
# Print a summary of the freshly loaded frame (columns, dtypes, non-null counts)
# to check that the whitespace parse produced the expected layout.
data_df.info()
In [58]:
# Label the four positional columns in file order: As level, Sb level,
# lineament-proximity flag, gold-deposit-proximity flag.
data_df.columns = ['as_level', 'sb_level', 'l_proximity', 'gold_proximity']
In [59]:
# Display the first rows to verify the column labels line up with the values.
data_df.head()
Out[59]:
In [11]:
# Scatter of the As level against the binary gold-deposit-proximity flag.
data_df.plot.scatter(x='as_level', y='gold_proximity')
Out[11]:
In [12]:
# Scatter of the Sb level against the binary gold-deposit-proximity flag.
data_df.plot.scatter(x='sb_level', y='gold_proximity')
Out[12]:
In [17]:
# Class balance of the response: number of rows per gold_proximity value.
data_df['gold_proximity'].value_counts().plot.bar()
Out[17]:
In [21]:
# Number of rows for every (gold_proximity, l_proximity) combination.
counts = data_df.groupby(['gold_proximity', 'l_proximity']).size()
# Pivot the inner level (l_proximity) into columns so each gold_proximity
# value gets one bar per l_proximity value.
counts.unstack(level='l_proximity').plot.bar()
Out[21]:
In [24]:
import statsmodels.formula.api as sm

# Logistic regression of the gold-deposit flag on the lineament flag, with the
# predictor treated as categorical through the formula's C(...) wrapper.
model = sm.logit('gold_proximity ~ C(l_proximity)', data_df)
result = model.fit()
# Last expression of the cell: renders the fitted model's summary table.
result.summary()
Out[24]:
In [45]:
from sklearn.model_selection import KFold

# Fixed seed: with shuffle=True and no random_state the fold assignment (and
# therefore every confusion matrix and score derived from it) changes on each run.
fold = KFold(n_splits=5, shuffle=True, random_state=0)


def cross_validate(train_index, test_index):
    """Fit the logit model on one training fold and predict its held-out fold.

    Reads the module-level ``data_df`` and ``sm`` (statsmodels formula API).

    Parameters
    ----------
    train_index, test_index : array-like of int
        Positional row indices, as yielded by ``fold.split(...)``.

    Returns
    -------
    pandas.DataFrame
        One row per held-out observation with the columns ``'predicted'``
        (0.0 / 1.0 after thresholding the fitted probability at 0.5) and
        ``'actual'`` (the observed ``gold_proximity`` value).
    """
    train_data, test_data = data_df.iloc[train_index], data_df.iloc[test_index]
    t_model = sm.logit(formula='gold_proximity ~ C(l_proximity)', data=train_data)
    t_result = t_model.fit()
    t_predict = t_result.predict(test_data)
    # Vectorised threshold; same 1.0 / 0.0 labels as a per-element lambda.
    t_predict = (t_predict >= 0.5).astype(float)
    return pd.concat({'predicted': t_predict, 'actual': test_data['gold_proximity']}, axis=1)


# One predicted-vs-actual frame per fold, in fold order.
cv_results = [
    cross_validate(train_index, test_index)
    for train_index, test_index in fold.split(data_df.index)
]
In [46]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# The loop variable is named `fold_result` (not `result`) so that it does not
# overwrite the fitted-model `result` created by the logit cell.
for idx, fold_result in enumerate(cv_results):
    # scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
    # actual classes and columns the predicted ones. `labels` pins the matrix to
    # 2x2 even when a fold happens to contain or predict only one class.
    c = confusion_matrix(fold_result['actual'], fold_result['predicted'], labels=[0, 1])
    pylab.figure()
    # fmt='d' prints the integer counts verbatim instead of the default '.2g',
    # which switches to scientific notation for counts of 100 or more.
    ax = sns.heatmap(c, annot=True, fmt='d')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix for fold %s' % (idx+1))
In [48]:
from sklearn.metrics import f1_score

# f1_score takes (y_true, y_pred). Binary F1 is symmetric in its two arguments,
# so the values are unchanged, but the call now follows the documented order.
scores = [f1_score(r['actual'], r['predicted']) for r in cv_results]
sc = pd.Series(data=scores, name='F1_scores')
# One bar per fold, in the same order as cv_results.
sc.plot(kind='bar', title='5-fold cross validation results')
Out[48]: