In [1]:
%matplotlib inline
import diogenes
import numpy as np
Get data from the wine quality data set
In [2]:
# Location of the white-wine quality CSV; its fields are separated by ';'.
wine_csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
# Read the CSV at that URL into `data`.
data = diogenes.read.open_csv_url(wine_csv_url, delimiter=';')
Note that data is a NumPy structured array. We can use it like this:
In [3]:
# Show the field (column) names of the structured array as the cell output.
data.dtype.names
Out[3]:
In [4]:
# Show the array's shape. The parenthesized single-argument form prints the
# same text on Python 2 and is also valid on Python 3.
print(data.shape)
In [5]:
# Index the structured array by field name to get one column. The
# parenthesized print works unchanged on Python 2 and Python 3.
print(data['fixed acidity'])
We separate our labels from the rest of the data and turn our labels into binary classes.
In [6]:
# Keep the raw quality scores under their own name so that `labels` only ever
# holds the binary classes.
quality = data['quality']
# True marks a wine whose quality is strictly below the mean quality score.
labels = quality < np.average(quality)
# Parenthesized print behaves the same on Python 2 and also runs on Python 3.
print(labels)
Remove the labels from the rest of our data
In [7]:
# Feature matrix: `data` with the 'quality' column removed, since that column
# is what the labels were derived from.
M = diogenes.modify.remove_cols(data, 'quality')
# Show the remaining column names; parenthesized print works on Python 2 and 3.
print(M.dtype.names)
Print summary statistics for our features
In [8]:
# Compute the per-column summary first, then pretty-print it.
col_summary = diogenes.display.describe_cols(M)
diogenes.display.pprint_sa(col_summary)
Plot correlation between features
In [9]:
# Plot the correlation matrix of the feature columns in M; the result is kept
# in `fig` (presumably a matplotlib figure - confirm against diogenes docs).
fig = diogenes.display.plot_correlation_matrix(M)
Arrange an experiment trying different classifiers
In [10]:
# The classifier collection shipped with diogenes that the experiment will try.
candidate_clfs = diogenes.grid_search.standard_clfs.std_clfs
# Set up the experiment over the features M and the binary labels.
exp = diogenes.grid_search.experiment.Experiment(M, labels, clfs=candidate_clfs)
Make a PDF report
In [11]:
# Build the report with verbose output turned off. As the last expression in
# the cell, whatever make_report returns is displayed as the cell output.
exp.make_report(verbose=False)
Out[11]:
Find the trial with the best score and make an ROC curve
In [12]:
# Mapping from each trial to its average score.
trials_with_score = exp.average_score()
# Pick the (trial, score) pair with the highest score. items() and the
# parenthesized prints behave the same on Python 2 and also run on Python 3,
# unlike iteritems() and the print statement.
best_trial, best_score = max(
    trials_with_score.items(),
    key=lambda trial_and_score: trial_and_score[1])
print(best_trial)
print(best_score)
In [13]:
# Draw the ROC curve for the best-scoring trial; the result is kept in `fig`
# (presumably a matplotlib figure - confirm against diogenes docs).
fig = best_trial.roc_curve()
In [ ]: