In [53]:
%matplotlib inline
import os
import json
import time
import pickle
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Here we retrieve the Wine Quality dataset from the UCI Machine Learning Repository with a small helper function:
In [54]:
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

def fetch_data(fname='winequality-red.csv'):
    """
    Helper method to retrieve the ML Repository dataset.
    """
    response = requests.get(URL)
    outpath = os.path.abspath(fname)
    with open(outpath, 'wb') as f:
        f.write(response.content)
    return outpath

# Fetch the data from the URL and save it to disk
DATA = fetch_data()
Next we do a bit of cleanup to prepare the dataset for analysis: we name the columns and map the numeric quality scores to text labels.
In [55]:
FEATURES = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
    "quality"
]
LABEL_MAP = {
    0: "Sangria",
    1: "Poor",
    2: "Below Average",
    3: "Slightly Below Average",
    4: "Average",
    5: "Slightly Above Average",
    6: "Above Average",
    7: "Good",
    8: "Excellent",
    9: "Phenomenal",
}

# Read the data into a DataFrame, replacing the CSV's header row with our names
df = pd.read_csv(DATA, sep=';', header=0, names=FEATURES)

# Convert class labels into text
for k, v in LABEL_MAP.items():
    df.loc[df.quality == k, 'quality'] = v

# Describe the dataset
print(df.describe())
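As a quick sanity check (a small addition, not part of the original notebook), we can confirm that the numeric scores were all mapped to text labels:
In [ ]:
# Verify the label mapping: quality should now contain only text categories
print(df['quality'].unique())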
Here we count the wines in each quality category, conveying much the same information as a histogram would.
Examining the distribution, we see that only a few wines fall in the tails, the 'Slightly Below Average' and 'Excellent' categories, so these rare classes will be hard for our algorithm to learn and may effectively behave as outliers.
In [25]:
# Determine the shape of the data
print("{} instances with {} features\n".format(*df.shape))
# Determine the frequency of each class
print(df.groupby('quality')['quality'].count())
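The same frequencies can also be viewed as a bar chart; a minimal sketch (not in the original notebook):
In [ ]:
# Histogram-style view of the class frequencies
df['quality'].value_counts().plot(kind='bar', figsize=(9, 6))
plt.ylabel('number of wines')
plt.show()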
In [26]:
# Create a scatter matrix of the dataframe features
from pandas.plotting import scatter_matrix
scatter_matrix(df, alpha=0.2, figsize=(12, 12), diagonal='kde')
plt.show()
Here we use Pandas to draw a parallel coordinates plot of our data.
Each line represents an instance and traces its value for each feature; the color represents the quality category. This lets us quickly glean information that would traditionally come from summary statistics, such as the max, min, and distribution of values for each feature. It also gives some insight into trends common to each color category.
In [27]:
from pandas.plotting import parallel_coordinates
plt.figure(figsize=(25,25))
parallel_coordinates(df, 'quality')
plt.show()
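With roughly 1,600 instances the parallel coordinates plot can become cluttered; plotting a random sample often reads better (the sample size here is an arbitrary choice, not from the original notebook):
In [ ]:
# Parallel coordinates on a random subset for readability
plt.figure(figsize=(25, 25))
parallel_coordinates(df.sample(n=200, random_state=42), 'quality')
plt.show()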
Here we use Pandas to visualize our data in a radial plot (RadViz), which normalizes the data and plots each instance relative to the features.
This is useful for spotting clusters and trends in the multivariate structure of our dataset. Where a scatter plot shows the interplay of two features, this reflects the interaction of a higher number of dimensions.
In [28]:
from pandas.plotting import radviz
plt.figure(figsize=(12,12))
radviz(df, 'quality')
plt.show()
In [56]:
from sklearn.utils import Bunch

DATA_DIR = os.path.abspath(os.path.join(".", "..", "MachineLearning", "data"))

# Show the contents of the data directory
for name in os.listdir(DATA_DIR):
    if name.startswith("."): continue
    print("- {}".format(name))
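The loader below expects meta.json, README.md, and WineQuality.txt to already exist in DATA_DIR. A minimal sketch of how the first and last of those might be generated from the raw CSV (the file names match load_data below; everything else here is an assumption, not part of the original notebook, and the README.md description would be written by hand):
In [ ]:
# Sketch: populate DATA_DIR for load_data (assumes the directory exists and
# that WineQuality.txt should be header-free, semicolon-delimited numbers)
raw = pd.read_csv(DATA, sep=';', header=0, names=FEATURES)

meta = {
    'target_names': list(LABEL_MAP.values()),
    'feature_names': FEATURES[:-1],
}
with open(os.path.join(DATA_DIR, 'meta.json'), 'w') as f:
    json.dump(meta, f, indent=2)

# np.loadtxt below requires purely numeric rows, quality in the last column
raw.to_csv(os.path.join(DATA_DIR, 'WineQuality.txt'),
           sep=';', header=False, index=False)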
In [57]:
def load_data(root=DATA_DIR):
    # Construct the `Bunch` for the wine dataset
    filenames = {
        'meta': os.path.join(root, 'meta.json'),
        'rdme': os.path.join(root, 'README.md'),
        'data': os.path.join(root, 'WineQuality.txt'),
    }

    # Load the meta data from the meta json
    with open(filenames['meta'], 'r') as f:
        meta = json.load(f)
        target_names = meta['target_names']
        feature_names = meta['feature_names']

    # Load the description from the README.
    with open(filenames['rdme'], 'r') as f:
        DESCR = f.read()

    # Load the dataset from the text file.
    dataset = np.loadtxt(filenames['data'], delimiter=";")

    # Extract the target from the data
    data = dataset[:, 0:-1]
    target = dataset[:, -1]

    # Create the bunch object
    return Bunch(
        data=data,
        target=target,
        filenames=filenames,
        target_names=target_names,
        feature_names=feature_names,
        DESCR=DESCR
    )

# Save the dataset as a variable we can use.
dataset = load_data()

print(dataset.data.shape)
print(dataset.target.shape)
In [58]:
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
In [59]:
def fit_and_evaluate(dataset, model, label, **kwargs):
    """
    Because of the Scikit-Learn API, we can create a function to
    do all of the fit and evaluate work on our behalf!
    """
    start = time.time()  # Start the clock!
    scores = {'precision': [], 'recall': [], 'accuracy': [], 'f1': []}

    for train, test in KFold(n_splits=12, shuffle=True).split(dataset.data):
        X_train, X_test = dataset.data[train], dataset.data[test]
        y_train, y_test = dataset.target[train], dataset.target[test]

        estimator = model(**kwargs)
        estimator.fit(X_train, y_train)

        expected = y_test
        predicted = estimator.predict(X_test)

        # Append our scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))

    # Report
    print("Build and Validation of {} took {:0.3f} seconds".format(label, time.time() - start))
    print("Validation scores are as follows:\n")
    print(pd.DataFrame(scores).mean())

    # Write official estimator to disk
    estimator = model(**kwargs)
    estimator.fit(dataset.data, dataset.target)

    outpath = label.lower().replace(" ", "-") + ".pickle"
    with open(outpath, 'wb') as f:
        pickle.dump(estimator, f)

    print("\nFitted model written to:\n{}".format(os.path.abspath(outpath)))
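scikit-learn also ships a built-in helper that performs the same k-fold scoring loop; a compact alternative sketch (not the notebook's approach, and the choice of kNN here is just an example):
In [ ]:
from sklearn.model_selection import cross_validate

# Cross-validated scoring of a single estimator with sklearn's built-in helper
cv_scores = cross_validate(
    KNeighborsClassifier(n_neighbors=12),
    dataset.data, dataset.target,
    cv=12,
    scoring=['precision_weighted', 'recall_weighted', 'accuracy', 'f1_weighted'],
)
print(pd.DataFrame(cv_scores).mean())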
In [67]:
# Perform SVC Classification
fit_and_evaluate(dataset, SVC, "Wine Quality SVM Classifier")
In [69]:
# Perform kNN Classification
fit_and_evaluate(dataset, KNeighborsClassifier, "Wine Quality kNN Classifier", n_neighbors=12)
In [70]:
# Perform Random Forest Classification
fit_and_evaluate(dataset, RandomForestClassifier, "Wine Quality Random Forest Classifier")
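Once a model is written to disk we can reload it and classify new instances; a minimal sketch, assuming the kNN cell above has been run (the feature values below are illustrative only):
In [ ]:
# Reload a fitted model from disk and classify a new instance (the pickle
# path matches the kNN label above; the feature values are hypothetical)
with open('wine-quality-knn-classifier.pickle', 'rb') as f:
    clf = pickle.load(f)

# One instance with the 11 physicochemical features, in dataset column order
example = np.array([[7.4, 0.70, 0.00, 1.9, 0.076, 11.0, 34.0,
                     0.9978, 3.51, 0.56, 9.4]])
print(clf.predict(example))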