In [1]:
%matplotlib inline
import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the wine data set. Column 0 is used as the class/colour label by the
# plotting cells below (presumably the cultivar id); columns 1+ are the
# chemical measurements — TODO confirm against the CSV header.
df = pd.read_csv('wine_data.csv')
In [2]:
# View data sample.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. `.loc` reproduces
# the original behaviour here: an inclusive label-based slice (rows 1..5 on
# the default RangeIndex).
df.loc[1:5]
Out[2]:
In [3]:
# Scatter of alcohol (column 1) vs hue (column 11), coloured by the class
# label in column 0. `.iloc` replaces the deprecated/removed `.ix` accessor
# (positional access was the intent here).
cmap = matplotlib.colors.ListedColormap(["red", "cyan", "blue"])
s = plt.scatter(df.iloc[:, 1], df.iloc[:, 11], c=df.iloc[:, 0], cmap=cmap)
plt.xlabel('Alcohol')
plt.ylabel('Hue')
plt.show()
From the plot above, we can see that there is no significant linear relationship between hue and alcohol content. However, a classifier such as k-nearest neighbours (kNN) might still be useful with only these two features, since the classes form distinct clusters with only slight overlap.
In [28]:
# Pairwise scatter of every feature pair (columns 1..13), coloured by the
# class label in column 0, to eyeball which feature pairs separate the
# classes. `.iloc` replaces the deprecated/removed `.ix` accessor; the loop
# indentation (lost in the notebook export) is restored.
cmap = matplotlib.colors.ListedColormap(["red", "cyan", "blue"])
for i in range(1, 13):
    for j in range(i + 1, 14):
        s = plt.scatter(df.iloc[:, i], df.iloc[:, j], c=df.iloc[:, 0], cmap=cmap)
        plt.xlabel(df.columns.values[i])
        plt.ylabel(df.columns.values[j])
        plt.show()
In [23]:
# One scatter per feature against the class label.
# Bug fix: the original passed `cmap` without `c`, so the colormap was
# silently ignored and all points drew in the default colour; colour by the
# class column (column 0) as the other plotting cells do. `.iloc` replaces
# the deprecated/removed `.ix` accessor.
for i in range(1, 14):
    s = plt.scatter(df.iloc[:, i], df.iloc[:, 0], c=df.iloc[:, 0], cmap=cmap)
    plt.xlabel(df.columns.values[i])
    plt.ylabel('Category')
    plt.show()
In [ ]:
In [11]:
# Mean-centre each feature and scale by its range (max - min), then compute
# the covariance matrix. Column 0 (the class label) is excluded from the
# features. `.iloc` replaces the deprecated/removed `.ix` accessor, and the
# repeated slice is hoisted into one variable.
features = df.iloc[:, 1:]
df_norm = (features - features.mean()) / (features.max() - features.min())
cov_mat = df_norm.cov()
In [6]:
# Compare the covariance eigenvalues (variances along the principal
# components) with the diagonal entries (per-feature variances).
# Python 3 print function: the Python 2 `print` statement is a SyntaxError
# on Python 3.
print(np.linalg.eigvals(cov_mat))
print(np.diagonal(cov_mat))
It looks like applying PCA would be helpful, given the difference between the diagonal entries of the current covariance matrix (the per-feature variances) and its eigenvalues (the variances along the principal components after PCA).
In [72]:
# Inject a button that toggles the visibility of all code input cells in the
# rendered notebook. NOTE(review): relies on jQuery (`$`) being available in
# the notebook page — this works in the classic Jupyter Notebook front end;
# confirm before relying on it in JupyterLab. The HTML/JS string is runtime
# content and is left byte-identical.
from IPython.display import HTML
HTML('''
<script>
code_show=false;
function code_toggle() {
if (code_show){
$('div.input').show();
} else {
$('div.input').hide();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code"></form>''')
Out[72]:
In [ ]: