Written by: Neeraj Asthana (under Professor Robert Brunner)
University of Illinois at Urbana-Champaign
Summer 2016
Dataset and full description available on the UCI Machine Learning Repository at: https://archive.ics.uci.edu/ml/datasets/seeds
The goal is to cluster seeds by variety (the class column) using seven continuous predictors.
Predictors: area, perimeter, compactness, length, width, asymmetry, and groove_length
In [3]:
#Libraries and Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.decomposition import PCA
In [5]:
#Names of all of the columns
names = [
'area'
, 'perimeter'
, 'compactness'
, 'length'
, 'width'
, 'asymmetry'
, 'groove_length'
, 'class'
]
#Import dataset
data = pd.read_csv('seeds_dataset.txt', sep = '\t', header = None, names = names)
data.head(10)
Out[5]:
In [6]:
data.shape
Out[6]:
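The read above can leave NaN cells (the raw file's tab separation is not perfectly regular). Before imputing, it is worth checking how many values are actually missing in each column; a minimal sketch on the raw data frame, not part of the original notebook:
In [ ]:
#Count missing values per column (illustrative sanity check)
print(data.isnull().sum())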
In [7]:
#Fill NaN with the mean of the column
data_clean = data.fillna(data.mean())
data_clean.shape
Out[7]:
In [8]:
#Select Predictor columns
X = data_clean.iloc[:, :-1]
#Scale X so that every column has zero mean and unit variance
X_scaled = preprocessing.scale(X)
#Select target column
y = data_clean['class']
y.value_counts()
Out[8]:
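A quick way to confirm the scaling behaved as expected: every column of X_scaled should now have (approximately) zero mean and unit variance. A minimal check, not in the original notebook:
In [ ]:
#Sanity check on the scaled predictors
print(X_scaled.mean(axis=0).round(6))
print(X_scaled.std(axis=0).round(6))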
In [9]:
# Visualize dataset with scatterplot matrix
%matplotlib inline
g = sns.PairGrid(data_clean, hue="class")
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
Out[9]:
In [10]:
#Fit a k-means clustering model with 3 clusters (one per seed class)
fit = KMeans(n_clusters=3).fit(X_scaled)
fit.labels_
Out[10]:
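Here the number of clusters is known to be 3 from the class column. When it is not known in advance, a common "elbow" diagnostic is to compare the inertia (within-cluster sum of squares) across several values of k; a minimal sketch, not part of the original analysis:
In [ ]:
#Inertia for k = 1..6; look for the k where the decrease levels off
inertias = [KMeans(n_clusters=k).fit(X_scaled).inertia_ for k in range(1, 7)]
print(inertias)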
In [11]:
#K-means cluster IDs are arbitrary; remap them (chosen by inspecting this run) to match the true classes
labels = fit.labels_.copy()  #copy, since slicing a NumPy array returns a view, not a copy
for index, val in enumerate(labels):
    if val == 1:
        labels[index] = 1
    elif val == 2:
        labels[index] = 3
    else:
        labels[index] = 2
labels
Out[11]:
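The remapping above was chosen by eye for this particular run. It can also be derived automatically by maximizing the diagonal of the class-vs-cluster contingency table with the Hungarian algorithm; a sketch, assuming SciPy is available:
In [ ]:
from scipy.optimize import linear_sum_assignment

#Contingency table: rows = true classes (1-3), columns = raw k-means labels (0-2)
cont = np.zeros((3, 3))
for true, pred in zip(y, fit.labels_):
    cont[true - 1, pred] += 1
#Minimizing the negated table maximizes total agreement on the diagonal
row_ind, col_ind = linear_sum_assignment(-cont)
mapping = {col: row + 1 for row, col in zip(row_ind, col_ind)}
auto_labels = np.array([mapping[lab] for lab in fit.labels_])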
In [12]:
conf_mat = np.zeros((3, 3))
for i, val in enumerate(y):
    conf_mat[val - 1, labels[i] - 1] += 1
#Rows are true classes, columns are predicted clusters
print(pd.DataFrame(conf_mat))
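From this matrix, overall agreement is the trace divided by the total count. The adjusted Rand index is a useful alternative because it is invariant to the arbitrary cluster numbering, so no manual remapping is needed; a sketch using sklearn.metrics:
In [ ]:
from sklearn.metrics import adjusted_rand_score

print(np.trace(conf_mat) / conf_mat.sum())  #accuracy after remapping
print(adjusted_rand_score(y, fit.labels_))  #unaffected by cluster ID permutation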
In [15]:
%matplotlib inline
#Scree plot: proportion of variance explained by each component
#Center X first so the singular values correspond to PCA eigenvalues
U, S, V = np.linalg.svd(X - X.mean())
eigvals = S**2 / np.sum(S**2)
fig = plt.figure(figsize=(8,6))
sing_vals = np.arange(7) + 1
plt.plot(sing_vals, eigvals, 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance Explained')
plt.show()
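The same proportions can be cross-checked against scikit-learn's PCA, whose explained_variance_ratio_ attribute reports them directly (PCA centers the data internally); a minimal sketch:
In [ ]:
#Cross-check of the scree values
print(PCA().fit(X).explained_variance_ratio_)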
In [27]:
pca = PCA(n_components=2)
pca.fit(X)
X_pca = pca.transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap=plt.cm.cool)
Out[27]:
In [28]:
pca_fit = KMeans(n_clusters=3).fit(X_pca)
pca_fit.labels_
Out[28]:
In [29]:
#Again remap the arbitrary k-means IDs (chosen by inspecting this run) to the true classes
pca_labels = pca_fit.labels_.copy()
for index, val in enumerate(pca_labels):
    if val == 0:
        pca_labels[index] = 1
    elif val == 1:
        pca_labels[index] = 3
    else:
        pca_labels[index] = 2
pca_labels
pca_conf_mat = np.zeros((3, 3))
for i, val in enumerate(y):
    pca_conf_mat[val - 1, pca_labels[i] - 1] += 1
#Rows are true classes, columns are predicted clusters
print(pd.DataFrame(pca_conf_mat))
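As an internal diagnostic that does not use the true classes at all, the silhouette score summarizes how well-separated the fitted clusters are (closer to 1 is better); a sketch using sklearn.metrics:
In [ ]:
from sklearn.metrics import silhouette_score

#Mean silhouette of the k-means clustering on the 2-D PCA projection
print(silhouette_score(X_pca, pca_fit.labels_))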
Workflow:
Read in the file
Handle missing values (e.g. ?, NA, etc.)
Select columns for the clustering task
Transform columns or variables
Cluster using K-Means
Perform diagnostics on the model
Visualize the results