In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

In [2]:
# Dataset: https://archive.ics.uci.edu/ml/datasets/Iris/
# IRIS Dataset Size: 150 samples
# Train: 70%  Eval: 30%

In [3]:
# Load the raw Iris CSV into a DataFrame.
iris_csv_path = '../Data/ClassExamples/Iris/iris.data.csv'
df = pd.read_csv(iris_csv_path)

In [4]:
# Preview the first five rows of the loaded data.
df.head(n=5)


Out[4]:
sepal_length sepal_width petal_length petal_width class
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa

In [5]:
# Preview the last five rows of the loaded data.
df.tail(n=5)


Out[5]:
sepal_length sepal_width petal_length petal_width class
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica

In [6]:
# Copy the frame's index labels into a plain Python list, then show
# its first five and last five entries (one slice per output line).
index_list = list(df.index)
print(index_list[:5], index_list[-5:], sep='\n')


[0, 1, 2, 3, 4]
[145, 146, 147, 148, 149]

In [7]:
# Shuffle the index labels in place.
# The global NumPy RNG is seeded first so the resulting order is
# the same on every run.
RANDOM_SEED = 5
np.random.seed(RANDOM_SEED)
np.random.shuffle(index_list)

In [8]:
# Show a heading followed by the first five and last five shuffled labels.
print('Shuffled list', index_list[:5], index_list[-5:], sep='\n')


Shuffled list
[82, 134, 114, 42, 109]
[8, 73, 144, 118, 99]

In [9]:
# Reorder the rows to follow the shuffled index labels in index_list.
# Label-based .loc (rather than positional .iloc) keeps this cell
# idempotent: re-running it always yields the same row order, whereas
# .iloc would permute the already-shuffled frame a second time.
# On a fresh run the result is the same as with .iloc, because the
# index labels (0..149, as printed earlier) coincide with row positions.
df = df.loc[index_list]

In [10]:
# First five rows after the reordering; the index column shows the
# original row labels.
df.head(n=5)


Out[10]:
sepal_length sepal_width petal_length petal_width class
82 5.8 2.7 3.9 1.2 Iris-versicolor
134 6.1 2.6 5.6 1.4 Iris-virginica
114 5.8 2.8 5.1 2.4 Iris-virginica
42 4.4 3.2 1.3 0.2 Iris-setosa
109 7.2 3.6 6.1 2.5 Iris-virginica

In [11]:
# Directory the exported CSV files are written to.
data_path = '../Data/ClassExamples/Iris/'

# Export the shuffled frame with all four features plus the class label.
# The row label is written as a leading column named 'Row'.
train_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
train_file = os.path.join(data_path, 'iris_data_train.csv')
df.to_csv(train_file, columns=train_columns, index=True, index_label='Row')

In [12]:
# Export the same rows with the four feature columns only; the 'class'
# column is left out. The row label is again written as 'Row'.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
test_file = os.path.join(data_path, 'iris_data_classifier_test.csv')
df.to_csv(test_file, columns=feature_columns, index=True, index_label='Row')

In [13]:
# Count how many rows belong to each class.
class_counts = df['class'].value_counts()
class_counts


Out[13]:
Iris-versicolor    50
Iris-setosa        50
Iris-virginica     50
Name: class, dtype: int64

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
summary_stats = df.describe()
summary_stats


Out[14]:
sepal_length sepal_width petal_length petal_width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000

In [15]:
# Boolean mask: one True/False entry per row, keyed by the row's index
# label, marking whether the row's class is 'Iris-setosa'.
setosa = df['class'].eq('Iris-setosa')

In [16]:
# Peek at the first five entries of the mask.
setosa.head(n=5)


Out[16]:
82     False
134    False
114    False
42      True
109    False
Name: class, dtype: bool

In [17]:
# Keep only the rows where the mask is True (the Iris-setosa samples)
# and show the first five of them.
df.loc[setosa].head()


Out[17]:
sepal_length sepal_width petal_length petal_width class
42 4.4 3.2 1.3 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
25 5.0 3.0 1.6 0.2 Iris-setosa
26 5.0 3.4 1.6 0.4 Iris-setosa
23 5.1 3.3 1.7 0.5 Iris-setosa

In [18]:
# Histogram of petal length across all samples.
plt.hist(df['petal_length'])
plt.xlabel('Petal Length')
plt.ylabel('Count')
plt.grid(True)
plt.title('Petal Length Histogram')


Out[18]:
<matplotlib.text.Text at 0x1dd5bca7390>

In [19]:
# Petal length of each sample plotted against the row's index label.
plt.scatter(df.index, df['petal_length'])
plt.xlabel('Sample')
plt.ylabel('Petal Length')
plt.grid(True)
plt.title('Petal Length Scatter Plot')


Out[19]:
<matplotlib.text.Text at 0x1dd5bec27b8>