In [205]:
from sklearn.datasets import load_iris
from sklearn import preprocessing

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib
matplotlib.style.use("ggplot")


Using matplotlib backend: MacOSX

In [206]:
# Load data and repair the header
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
target_names = [data.target_names[i] for i in data.target]
df["target_names"] = target_names
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "target_names"]
df.columns = columns
print(df.head())
print(df.describe())


   sepal_length  sepal_width  petal_length  petal_width target_names
0           5.1          3.5           1.4          0.2       setosa
1           4.9          3.0           1.4          0.2       setosa
2           4.7          3.2           1.3          0.2       setosa
3           4.6          3.1           1.5          0.2       setosa
4           5.0          3.6           1.4          0.2       setosa
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

In [207]:
# Convert data
#df = pd.get_dummies(df, columns=["target_names"])
print(df.dtypes)


sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
target_names     object
dtype: object

In [208]:
# Clean data
#print(df[df.sepal_length.isnull()])
#df = df.fillna(0.0, axis=0).reindex()

In [209]:
df_norm = df.iloc[:, 0:4]

scaler = preprocessing.MinMaxScaler()
np_scaled = scaler.fit_transform(df_norm)
df_norm = pd.DataFrame(np_scaled)
df_norm["target_names"] = target_names
df_norm.columns = columns
df_norm.describe()


Out[209]:
sepal_length sepal_width petal_length petal_width
count 150.000000 150.000000 150.000000 150.000000
mean 0.428704 0.439167 0.467571 0.457778
std 0.230018 0.180664 0.299054 0.317984
min 0.000000 0.000000 0.000000 0.000000
25% 0.222222 0.333333 0.101695 0.083333
50% 0.416667 0.416667 0.567797 0.500000
75% 0.583333 0.541667 0.694915 0.708333
max 1.000000 1.000000 1.000000 1.000000

In [210]:
pd.tools.plotting.parallel_coordinates(df_norm, "target_names")


Out[210]:
<matplotlib.axes._subplots.AxesSubplot at 0x119157160>

In [211]:
pd.tools.plotting.parallel_coordinates(df, "target_names")


Out[211]:
<matplotlib.axes._subplots.AxesSubplot at 0x119157160>

In [ ]: