In [230]:
# Read all the comments, the first approach is taken to highlight a wrong approach
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
%matplotlib inline

In [231]:
iris = datasets.load_iris()
iris_pd=pd.DataFrame(iris.data)

In [232]:
iris_pd.head()


Out[232]:
0 1 2 3
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2

In [233]:
iris.keys()


Out[233]:
dict_keys(['target', 'target_names', 'DESCR', 'feature_names', 'data'])

In [234]:
iris_pd.columns=iris.feature_names
iris_pd.head()


Out[234]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2

In [235]:
# The Iris Setosa from wiki pulled out
from IPython.display import Image
url = 'http://upload.wikimedia.org/wikipedia/commons/5/56/Kosaciec_szczecinkowaty_Iris_setosa.jpg'
Image(url,width=300, height=300)


Out[235]:

In [236]:
# The Iris Versicolor from Wiki pull out
from IPython.display import Image
url = 'http://upload.wikimedia.org/wikipedia/commons/4/41/Iris_versicolor_3.jpg'
Image(url,width=300, height=300)


Out[236]:

In [237]:
# The Iris Virginica from wiki pull out
from IPython.display import Image
url = 'http://upload.wikimedia.org/wikipedia/commons/9/9f/Iris_virginica.jpg'
Image(url,width=300, height=300)


Out[237]:

In [238]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [239]:
# Incorrect approch as we are not seperating the predicted variable
# from sklearn import svm
from sklearn.svm import SVC
svc_model = SVC()
svc_model.fit(X_train,y_train)


Out[239]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [240]:
predictions = svc_model.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))


[[13  0  0]
 [ 0 13  0]
 [ 0  0 12]]

In [241]:
# Print confusion Matrix

In [242]:
print(classification_report(y_test,predictions))
# Wrong prediction as we have not dropped the predicted variable


             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        13
 versicolor       1.00      1.00      1.00        13
  virginica       1.00      1.00      1.00        12

avg / total       1.00      1.00      1.00        38


In [243]:
# Correct approach below
#import seaborn as sns
iris = sns.load_dataset('iris')

In [244]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [245]:
# Setosa is the most separable. 
sns.pairplot(iris,hue='species',palette='Dark2')


Out[245]:
<seaborn.axisgrid.PairGrid at 0x22085864b00>

In [246]:
from sklearn.model_selection import train_test_split

In [247]:
import seaborn as sns
iris = sns.load_dataset('iris')
X = iris.drop('species',axis=1)
y = iris['species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [248]:
from sklearn.svm import SVC

In [249]:
svc_model = SVC()

In [250]:
svc_model.fit(X_train,y_train)


Out[250]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [251]:
predictions = svc_model.predict(X_test)

In [252]:
from sklearn.metrics import classification_report,confusion_matrix

In [253]:
print(confusion_matrix(y_test,predictions))


[[21  0  0]
 [ 0 13  1]
 [ 0  0 10]]

In [254]:
print(classification_report(y_test,predictions))


             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        21
 versicolor       1.00      0.93      0.96        14
  virginica       0.91      1.00      0.95        10

avg / total       0.98      0.98      0.98        45


In [ ]:
# Good prediction after column dropped at 98%