Iris Project

Data Exploration and Analysis



In [4]:

    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)
%matplotlib inline



In [5]:

    
# First attempt at loading the raw file with pandas' default header handling.
# iris.data has no header row, so the first record (5.1, 3.5, 1.4, 0.2,
# Iris-setosa) ends up as the column labels -- see the output of df.head().
df = pd.read_csv(filepath_or_buffer='iris.data')



In [6]:

    
df.head()









    Out[6]:






  
    
      
      5.1
      3.5
      1.4
      0.2
      Iris-setosa
    
  
  
    
      0
      4.9
      3.0
      1.4
      0.2
      Iris-setosa
    
    
      1
      4.7
      3.2
      1.3
      0.2
      Iris-setosa
    
    
      2
      4.6
      3.1
      1.5
      0.2
      Iris-setosa
    
    
      3
      5.0
      3.6
      1.4
      0.2
      Iris-setosa
    
    
      4
      5.4
      3.9
      1.7
      0.4
      Iris-setosa



In [7]:

    
# iris.data ships without a header row; pd.read_csv's `header` argument
# controls which row (if any) supplies the column names.
# (The interactive `pd.read_csv?` help lookup was removed so the notebook
# runs cleanly from top to bottom.)



In [8]:

    
# Reload the raw file without promoting the first record to column names.
# `header=None` is the supported way to say "this file has no header row";
# negative values such as -1 are rejected with a ValueError by current pandas.
# Columns are auto-labelled 0..4 until proper names are assigned.
df = pd.read_csv('iris.data', header=None)
df.head()









    Out[8]:






  
    
      
      0
      1
      2
      3
      4
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      Iris-setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      Iris-setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      Iris-setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      Iris-setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      Iris-setosa



In [9]:

    
# Column labels for the header-less raw file, in file order:
# the four measurements followed by the class label.
col_name = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
    'class',
]



In [10]:

    
df.columns = col_name



In [11]:

    
df.head()









    Out[11]:






  
    
      
      sepal length
      sepal width
      petal length
      petal width
      class
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      Iris-setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      Iris-setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      Iris-setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      Iris-setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      Iris-setosa

Iris Data from Seaborn



In [12]:

    
# Load seaborn's bundled copy of the iris data; unlike the raw file it
# already has snake_case column names and a `species` column.
iris = sns.load_dataset(name='iris')
iris.head(n=5)









    Out[12]:






  
    
      
      sepal_length
      sepal_width
      petal_length
      petal_width
      species
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      setosa



In [13]:

    
df.describe()









    Out[13]:






  
    
      
      sepal length
      sepal width
      petal length
      petal width
    
  
  
    
      count
      150.000000
      150.000000
      150.000000
      150.000000
    
    
      mean
      5.843333
      3.054000
      3.758667
      1.198667
    
    
      std
      0.828066
      0.433594
      1.764420
      0.763161
    
    
      min
      4.300000
      2.000000
      1.000000
      0.100000
    
    
      25%
      5.100000
      2.800000
      1.600000
      0.300000
    
    
      50%
      5.800000
      3.000000
      4.350000
      1.300000
    
    
      75%
      6.400000
      3.300000
      5.100000
      1.800000
    
    
      max
      7.900000
      4.400000
      6.900000
      2.500000



In [14]:

    
iris.describe()









    Out[14]:






  
    
      
      sepal_length
      sepal_width
      petal_length
      petal_width
    
  
  
    
      count
      150.000000
      150.000000
      150.000000
      150.000000
    
    
      mean
      5.843333
      3.057333
      3.758000
      1.199333
    
    
      std
      0.828066
      0.435866
      1.765298
      0.762238
    
    
      min
      4.300000
      2.000000
      1.000000
      0.100000
    
    
      25%
      5.100000
      2.800000
      1.600000
      0.300000
    
    
      50%
      5.800000
      3.000000
      4.350000
      1.300000
    
    
      75%
      6.400000
      3.300000
      5.100000
      1.800000
    
    
      max
      7.900000
      4.400000
      6.900000
      2.500000



In [15]:

    
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
iris.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
species         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.3+ KB
None



In [16]:

    
# Class balance check: number of rows recorded for each species.
print(iris.groupby(by='species').size())









    



species
setosa        50
versicolor    50
virginica     50
dtype: int64

Visualisation



In [17]:

    
# Pairwise scatter plots of every feature pair, coloured by species.
# `height` is the current name of the per-facet size keyword: seaborn 0.9
# deprecated `size` and later releases removed it.
# The trailing semicolon hides the PairGrid repr from the cell output.
sns.pairplot(iris, hue='species', height=3, aspect=1.0);









    Out[17]:





<seaborn.axisgrid.PairGrid at 0xa9071d0>



In [18]:

    
# One histogram per numeric column; the black bar outlines keep adjacent
# bins visually distinct.
iris.hist(figsize=(12, 8), edgecolor='black', linewidth=1.2)
plt.show()



In [19]:

    
# DataFrame.hist forwards extra keyword arguments (e.g. `edgecolor`,
# `linewidth`) to matplotlib's hist; see the pandas DataFrame.hist docs.
# (The interactive `iris.hist?` help lookup was removed so the notebook
# runs cleanly from top to bottom.)



In [20]:

    
# Violin plot of each measurement, split by species, laid out on a 2x2 grid.
# Looping over the feature names replaces four copy-pasted subplot/violinplot
# pairs, and passing `ax=` targets each panel explicitly instead of relying
# on pyplot's "current axes" state.
violin_features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, feature in zip(axes.ravel(), violin_features):
    sns.violinplot(x='species', y=feature, data=iris, ax=ax)
# Render the figure without echoing the last Axes repr as cell output.
plt.show()









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0xb6c6e90>



In [21]:

    
# Box plot of every numeric column, grouped by species.
iris.boxplot(figsize=(12, 8), by='species')
plt.show()



In [23]:

    
# Scatter-plot matrix of the numeric columns.
# scatter_matrix lives in the pandas.plotting namespace; the top-level
# `pd.scatter_matrix` alias was deprecated and is no longer available in
# current pandas releases.
pd.plotting.scatter_matrix(iris, figsize=(12, 8))
plt.show()



In [24]:

    
iris.head()









    Out[24]:






  
    
      
      sepal_length
      sepal_width
      petal_length
      petal_width
      species
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      setosa

Key Points

Data in the form of a table
Features in the form of a matrix
Label or target array

Scikit-Learn API

General Principles

Consistency. All objects (basic or composite) share a consistent interface composed of a limited set of methods. This interface is documented in a consistent manner for all objects.
Inspection. Constructor parameters and parameter values determined by learning algorithms are stored and exposed as public attributes.
Non-proliferation of classes. Learning algorithms are the only objects to be represented using custom classes. Datasets are represented as NumPy arrays or SciPy sparse matrices. Hyper-parameter names and values are represented as standard Python strings or numbers whenever possible. This keeps scikit-learn easy to use and easy to combine with other libraries.
Composition. Many machine learning tasks are expressible as sequences or combinations of transformations to data. Some learning algorithms are also naturally viewed as meta-algorithms parametrized on other algorithms. Whenever feasible, such algorithms are implemented and composed from existing building blocks.
Sensible defaults. Whenever an operation requires a user-defined parameter, an appropriate default value is defined by the library. The default value should cause the operation to be performed in a sensible way (giving a baseline solution for the task at hand).

Basic Steps of Using Scikit-Learn API

Choose a class of model
Choose model hyperparameters
Arrange data into feature matrix and target array
Fit model to data
Apply trained model to new data

Supervised Learning: Simple Linear Regression



In [25]:

    
# Seed NumPy's global generator so the synthetic data -- and every fitted
# coefficient derived from it -- is reproducible on Restart & Run All.
np.random.seed(42)
# 100 x-values drawn uniformly from [0, 10).
x = 10 * np.random.rand(100)



In [26]:

    
# Target: a line of slope 3 plus uniform noise in [0, 1). The noise is not
# zero-centred (its mean is 0.5), so a fitted intercept near 0.5 is expected.
y = np.random.rand(100) + 3 * x



In [27]:

    
# Quick look at the synthetic data before fitting. Axis labels let the
# figure stand alone, and plt.show() keeps the PathCollection repr out of
# the cell output.
plt.scatter(x, y)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Synthetic linear data')
plt.show()









    Out[27]:





<matplotlib.collections.PathCollection at 0x2bda810>



In [29]:

    
from sklearn.linear_model import LinearRegression



In [30]:

    
model = LinearRegression(fit_intercept=True)



In [31]:

    
model









    Out[31]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [32]:

    
# Arrange the 1-D sample vector as a single-column feature matrix
# (n_samples, 1) before handing it to the estimator.
X = np.reshape(x, (-1, 1))
X.shape









    Out[32]:





(100, 1)



In [33]:

    
model.fit(X, y)









    Out[33]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [34]:

    
model.coef_









    Out[34]:





array([ 3.01975213])



In [35]:

    
model.intercept_









    Out[35]:





0.450644289723515



In [36]:

    
# Evenly spaced x-values for drawing the fitted line; the range extends one
# unit beyond the sampled [0, 10) interval on each side. 50 points is
# np.linspace's default count, spelled out here for clarity.
x_fit = np.linspace(start=-1, stop=11, num=50)



In [37]:

    
# Same single-column layout as the training matrix, so the fitted model
# can predict on it.
X_fit = np.reshape(x_fit, (-1, 1))



In [38]:

    
y_fit = model.predict(X_fit)



In [39]:

    
# Overlay the fitted regression line on the data. Labels and a legend let
# the figure stand alone, and plt.show() keeps the Line2D repr out of the
# cell output.
plt.scatter(x, y, label='data')
plt.plot(x_fit, y_fit, label='linear fit')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Simple linear regression')
plt.legend()
plt.show()









    Out[39]:





[<matplotlib.lines.Line2D at 0xb72f750>]



In [ ]:

	5.1	3.5	1.4	0.2	Iris-setosa
0	4.9	3.0	1.4	0.2	Iris-setosa
1	4.7	3.2	1.3	0.2	Iris-setosa
2	4.6	3.1	1.5	0.2	Iris-setosa
3	5.0	3.6	1.4	0.2	Iris-setosa
4	5.4	3.9	1.7	0.4	Iris-setosa

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

	sepal length	sepal width	petal length	petal width
count	150.000000	150.000000	150.000000	150.000000
mean	5.843333	3.054000	3.758667	1.198667
std	0.828066	0.433594	1.764420	0.763161
min	4.300000	2.000000	1.000000	0.100000
25%	5.100000	2.800000	1.600000	0.300000
50%	5.800000	3.000000	4.350000	1.300000
75%	6.400000	3.300000	5.100000	1.800000
max	7.900000	4.400000	6.900000	2.500000