In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
In [2]:
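# Dataset: UCI Iris - https://archive.ics.uci.edu/ml/datasets/Iris/
# Iris dataset size: 150 samples
# Train: 70%, Eval: 30%
# NOTE(review): no cell visible in this notebook performs the 70/30 split - confirm where it happens.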
In [3]:
# Load the Iris CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer='../Data/ClassExamples/Iris/iris.data.csv')
In [4]:
# Preview the first five rows as loaded.
df.head(n=5)
Out[4]:
In [5]:
# Preview the last five rows as loaded.
df.tail(n=5)
Out[5]:
In [6]:
# Copy the row labels into a plain list, then show its first and last five entries
# (one slice per output line).
index_list = df.index.tolist()
print(index_list[:5], index_list[-5:], sep='\n')
In [7]:
# Shuffle the label list in place; seeding NumPy's global generator first
# makes the resulting order repeatable from run to run.
shuffle_seed = 5
np.random.seed(shuffle_seed)
np.random.shuffle(index_list)
In [8]:
# Show both ends of the label list again, now in shuffled order.
print('Shuffled list', index_list[:5], index_list[-5:], sep='\n')
In [9]:
# Reorder the rows so they follow the shuffled labels held in index_list.
# index_list was built from df.index, so its entries are index *labels*, not
# positions; select with .loc rather than .iloc. Label-based selection also
# makes this cell idempotent: re-running it gives the same row order instead
# of permuting the already-shuffled frame a second time.
# Assumes the index labels are unique (true for the default index read_csv creates).
df = df.loc[index_list]
In [10]:
# First five rows after reordering; the index column shows the original row labels.
df.head(n=5)
Out[10]:
In [11]:
# Write the frame, including the 'class' label column, to the training CSV.
# The index is kept and written under the header 'Row'.
data_path = '../Data/ClassExamples/Iris/'
df.to_csv(
    os.path.join(data_path, 'iris_data_train.csv'),
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'],
    index_label='Row',
    index=True,
)
In [12]:
# Write the same rows with only the four measurement columns (no 'class')
# to the classifier test CSV. The index is kept and written under the header 'Row'.
df.to_csv(
    os.path.join(data_path, 'iris_data_classifier_test.csv'),
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
    index_label='Row',
    index=True,
)
In [13]:
# Number of rows per class label.
df.loc[:, 'class'].value_counts()
Out[13]:
In [14]:
# Summary statistics for the numeric columns, with the default quartiles spelled out.
df.describe(percentiles=[0.25, 0.5, 0.75])
Out[14]:
In [15]:
# Boolean mask: a Series sharing df's index, True where the row's class is 'Iris-setosa'.
setosa = df['class'].eq('Iris-setosa')
In [16]:
# First five entries of the mask.
setosa.head(n=5)
Out[16]:
In [17]:
# Keep only the rows where the mask is True (the setosa samples) and preview five of them.
df.loc[setosa].head(n=5)
Out[17]:
In [18]:
# Histogram of petal length over all rows, using matplotlib's default binning.
plt.hist(df['petal_length'])
plt.xlabel('Petal Length')
plt.ylabel('Count')
plt.grid(True)
plt.title('Petal Length Histogram')
Out[18]:
In [19]:
# Petal length of each row plotted against that row's index label.
plt.scatter(x=df.index, y=df['petal_length'])
plt.xlabel('Sample')
plt.ylabel('Petal Length')
plt.grid(True)
plt.title('Petal Length Scatter Plot')
Out[19]: