notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os



In [2]:

    
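# Dataset: https://archive.ics.uci.edu/ml/datasets/Iris/
# Iris dataset size: 150 samples
# Intended split: Train 70% / Eval 30%
# NOTE: the cells shown here only shuffle the rows and write all of them out;
# the 70/30 split itself is presumably done by whatever consumes the files - confirm.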



In [3]:

    
# Load the raw Iris CSV; the path is relative to the notebook's working directory.
iris_csv_path = '../Data/ClassExamples/Iris/iris.data.csv'
df = pd.read_csv(iris_csv_path)



In [4]:

    
# Preview the first five rows of the loaded frame.
df.head(n=5)









    Out[4]:







  
    
      
      sepal_length
      sepal_width
      petal_length
      petal_width
      class
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      Iris-setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      Iris-setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      Iris-setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      Iris-setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      Iris-setosa



In [5]:

    
# Preview the last five rows of the loaded frame.
df.tail(n=5)









    Out[5]:







  
    
      
      sepal_length
      sepal_width
      petal_length
      petal_width
      class
    
  
  
    
      145
      6.7
      3.0
      5.2
      2.3
      Iris-virginica
    
    
      146
      6.3
      2.5
      5.0
      1.9
      Iris-virginica
    
    
      147
      6.5
      3.0
      5.2
      2.0
      Iris-virginica
    
    
      148
      6.2
      3.4
      5.4
      2.3
      Iris-virginica
    
    
      149
      5.9
      3.0
      5.1
      1.8
      Iris-virginica



In [6]:

    
# Snapshot the row labels as a plain Python list, then print the first five
# and the last five labels.
index_list = df.index.tolist()
for label_slice in (index_list[:5], index_list[-5:]):
    print(label_slice)









    



[0, 1, 2, 3, 4]
[145, 146, 147, 148, 149]



In [7]:

    
# Shuffle the row labels reproducibly (fixed seed).
# np.random.shuffle works in place, so re-running this cell on an already
# shuffled list would produce a different order even with the same seed.
# Sorting first restores the starting order, making the cell safe to re-run.
# This assumes the original labels were in ascending order, as the printed
# first/last values suggest; in that case the sort is a no-op on the first run.
index_list.sort()
np.random.seed(5)
np.random.shuffle(index_list)



In [8]:

    
# Print both ends of the label list again, now in shuffled order.
print('Shuffled list')
leading_labels, trailing_labels = index_list[:5], index_list[-5:]
print(leading_labels)
print(trailing_labels)









    



Shuffled list
[82, 134, 114, 42, 109]
[8, 73, 144, 118, 99]



In [9]:

    
# Reorder the rows to follow the shuffled label list.
# Label-based .loc is used rather than positional .iloc. On the first run the
# two select the same rows (assuming the frame still has its default 0..n-1
# index from read_csv), but .loc yields the same order if this cell is run
# again, whereas .iloc would re-permute the already shuffled frame.
df = df.loc[index_list]



In [10]:

    
# Preview the first five rows after reordering; the index column keeps the
# original row labels.
df.head(n=5)









    Out[10]:







  
    
      
      sepal_length
      sepal_width
      petal_length
      petal_width
      class
    
  
  
    
      82
      5.8
      2.7
      3.9
      1.2
      Iris-versicolor
    
    
      134
      6.1
      2.6
      5.6
      1.4
      Iris-virginica
    
    
      114
      5.8
      2.8
      5.1
      2.4
      Iris-virginica
    
    
      42
      4.4
      3.2
      1.3
      0.2
      Iris-setosa
    
    
      109
      7.2
      3.6
      6.1
      2.5
      Iris-virginica



In [11]:

    
# Directory that receives the generated CSV files.
data_path = '../Data/ClassExamples/Iris/'

# Write every row of the frame, features plus the 'class' label, keeping the
# row labels in a leading column named 'Row'.
train_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
train_file = os.path.join(data_path, 'iris_data_train.csv')
df.to_csv(train_file, columns=train_columns, index=True, index_label='Row')



In [12]:

    
# Write the same rows again without the 'class' column (features only),
# keeping the row labels in a leading column named 'Row'.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
classifier_test_file = os.path.join(data_path, 'iris_data_classifier_test.csv')
df.to_csv(classifier_test_file, columns=feature_columns, index=True, index_label='Row')



In [13]:

    
# Count how many rows carry each class label.
class_counts = df['class'].value_counts()
class_counts









    Out[13]:





Iris-versicolor    50
Iris-setosa        50
Iris-virginica     50
Name: class, dtype: int64



In [14]:

    
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
summary_stats = df.describe()
summary_stats









    Out[14]:







  
    
      
      sepal_length
      sepal_width
      petal_length
      petal_width
    
  
  
    
      count
      150.000000
      150.000000
      150.000000
      150.000000
    
    
      mean
      5.843333
      3.054000
      3.758667
      1.198667
    
    
      std
      0.828066
      0.433594
      1.764420
      0.763161
    
    
      min
      4.300000
      2.000000
      1.000000
      0.100000
    
    
      25%
      5.100000
      2.800000
      1.600000
      0.300000
    
    
      50%
      5.800000
      3.000000
      4.350000
      1.300000
    
    
      75%
      6.400000
      3.300000
      5.100000
      1.800000
    
    
      max
      7.900000
      4.400000
      6.900000
      2.500000



In [15]:

    
# Boolean mask, aligned with df's index, that is True where the class label
# equals 'Iris-setosa' and False elsewhere.
setosa = df['class'].eq('Iris-setosa')



In [16]:

    
# Show the first five entries of the mask (row label -> True/False).
setosa.head(n=5)









    Out[16]:





82     False
134    False
114    False
42      True
109    False
Name: class, dtype: bool



In [17]:

    
# Keep only the rows where the mask is True, then preview the first five.
df.loc[setosa].head()









    Out[17]:







  
    
      
      sepal_length
      sepal_width
      petal_length
      petal_width
      class
    
  
  
    
      42
      4.4
      3.2
      1.3
      0.2
      Iris-setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      Iris-setosa
    
    
      25
      5.0
      3.0
      1.6
      0.2
      Iris-setosa
    
    
      26
      5.0
      3.4
      1.6
      0.4
      Iris-setosa
    
    
      23
      5.1
      3.3
      1.7
      0.5
      Iris-setosa



In [18]:

    
# Histogram of petal length across all rows.
# Explicit Axes calls are used and the cell ends with plt.show(), so only the
# figure is rendered and the repr of the last Text object is not echoed as
# cell output. The stray trailing comma in the hist() call is also dropped.
fig, ax = plt.subplots()
ax.hist(df.petal_length)
ax.grid(True)
ax.set_xlabel('Petal Length')
ax.set_ylabel('Count')
ax.set_title('Petal Length Histogram')
plt.show()









    Out[18]:





<matplotlib.text.Text at 0x1dd5bca7390>



In [19]:

    
# Scatter of petal length against the row label (df.index), so each point is
# placed by its label rather than by its position in the frame.
# Explicit Axes calls are used and the cell ends with plt.show(), so only the
# figure is rendered and the repr of the last Text object is not echoed as
# cell output.
fig, ax = plt.subplots()
ax.scatter(df.index, df.petal_length)
ax.grid(True)
ax.set_xlabel('Sample')
ax.set_ylabel('Petal Length')
ax.set_title('Petal Length Scatter Plot')
plt.show()









    Out[19]:





<matplotlib.text.Text at 0x1dd5bec27b8>

	sepal_length	sepal_width	petal_length	petal_width	class
0	5.1	3.5	1.4	0.2	Iris-setosa
1	4.9	3.0	1.4	0.2	Iris-setosa
2	4.7	3.2	1.3	0.2	Iris-setosa
3	4.6	3.1	1.5	0.2	Iris-setosa
4	5.0	3.6	1.4	0.2	Iris-setosa

	sepal_length	sepal_width	petal_length	petal_width	class
145	6.7	3.0	5.2	2.3	Iris-virginica
146	6.3	2.5	5.0	1.9	Iris-virginica
147	6.5	3.0	5.2	2.0	Iris-virginica
148	6.2	3.4	5.4	2.3	Iris-virginica
149	5.9	3.0	5.1	1.8	Iris-virginica

	sepal_length	sepal_width	petal_length	petal_width	class
82	5.8	2.7	3.9	1.2	Iris-versicolor
134	6.1	2.6	5.6	1.4	Iris-virginica
114	5.8	2.8	5.1	2.4	Iris-virginica
42	4.4	3.2	1.3	0.2	Iris-setosa
109	7.2	3.6	6.1	2.5	Iris-virginica

	sepal_length	sepal_width	petal_length	petal_width
count	150.000000	150.000000	150.000000	150.000000
mean	5.843333	3.054000	3.758667	1.198667
std	0.828066	0.433594	1.764420	0.763161
min	4.300000	2.000000	1.000000	0.100000
25%	5.100000	2.800000	1.600000	0.300000
50%	5.800000	3.000000	4.350000	1.300000
75%	6.400000	3.300000	5.100000	1.800000
max	7.900000	4.400000	6.900000	2.500000