In [8]:
from sklearn.datasets import load_iris
irisdata = load_iris()
In [54]:
import pandas as pd
features = pd.DataFrame(irisdata['data'])
features.columns = irisdata['feature_names']
targets = pd.DataFrame(irisdata['target'])
targets = targets.replace([0, 1, 2], list(irisdata['target_names']))  # map integer codes to species names
In [55]:
features.isnull().sum()
Out[55]:
In [56]:
targets.isnull().sum()
Out[56]:
In [57]:
targets[0].unique()
Out[57]:
In [58]:
features.shape
Out[58]:
In [66]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
# Encode the string class names back to integers 0, 1, 2
for col in targets.columns:
    targets[col] = labelencoder.fit_transform(targets[col])
In [64]:
targets[0].unique()
Out[64]:
In [65]:
print(targets.groupby(0).size())
In [68]:
import matplotlib.pyplot as plt
%matplotlib inline
fig,axes = plt.subplots(nrows=2,ncols=2,figsize=(9,9))
fig1 = axes[0,0].boxplot(features['sepal length (cm)'],patch_artist=True)
fig2 = axes[0,1].boxplot(features['sepal width (cm)'],patch_artist=True)
fig3 = axes[1,0].boxplot(features['petal length (cm)'],patch_artist=True)
fig4 = axes[1,1].boxplot(features['petal width (cm)'],patch_artist=True)
In [69]:
features.describe()
Out[69]:
In [70]:
features.corr()
Out[70]:
In [72]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(features)
X
Out[72]:
In [74]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit_transform(X)
Out[74]:
In [76]:
covariance = pca.get_covariance()
explained_variance = pca.explained_variance_
explained_variance
Out[76]:
In [78]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(6, 4))
plt.bar(range(4), explained_variance, alpha=0.5, align='center',
        label='individual explained variance')
plt.ylabel('Explained variance')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
The last component carries only a small share of the variance, while the first three retain more than 90% of it. (Here, with only four features and enough instances to support the results, we keep all features in consideration; the clustering step below uses just the first three components.)
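To back that figure up, the cumulative explained variance ratio can be checked directly; a minimal sketch reusing the pca object fitted above:
In [ ]:
import numpy as np
# Share of total variance retained by the first k components
cum_ratio = np.cumsum(pca.explained_variance_ratio_)
for k, r in enumerate(cum_ratio, start=1):
    print(f'first {k} component(s): {r:.1%}')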
In [116]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
x_pca = pca.fit_transform(X)
kmeans = KMeans(n_clusters=3, random_state=5)
x_clustered = kmeans.fit_predict(x_pca)
y = targets.values.ravel()  # flatten the (150, 1) target frame to a 1-D label array
In [119]:
import matplotlib.pyplot as plt
%matplotlib inline
LABEL_COLOR_MAP = {0: 'g',
                   1: 'y',
                   2: 'r'}
label_color = [LABEL_COLOR_MAP[i] for i in x_clustered]
y_color = [LABEL_COLOR_MAP[i] for i in y]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(5, 3))
axes[0].scatter(X[:, 0], X[:, 1], c=label_color)
axes[0].set_title('K-means clusters (on PCA projection)')
axes[1].scatter(X[:, 0], X[:, 1], c=y_color)
axes[1].set_title('True labels');
Using K-means on the first three principal components (those with the most variance), the three classes are separated well. (Ignore the specific colors: cluster IDs are arbitrary in clustering, so matching colors between panels is meaningless.)
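Because the cluster IDs are arbitrary, a permutation-invariant score such as the adjusted Rand index is one way to quantify how well the clusters match the true labels; a minimal sketch using x_clustered and y from above:
In [ ]:
from sklearn.metrics import adjusted_rand_score
# 1.0 means perfect agreement up to relabeling of clusters; ~0 is chance level
print(adjusted_rand_score(y, x_clustered))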
You can also apply PCA first, as a preprocessing step, before the supervised models in the next steps.
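One way to wire that up is an sklearn Pipeline chaining the scaler, PCA, and a classifier. This is an illustrative sketch, not the configuration used below:
In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# Illustrative: scale, project onto 3 components, then classify
pipe = Pipeline([('scale', StandardScaler()),
                 ('pca', PCA(n_components=3)),
                 ('clf', LogisticRegression())])
pipe.fit(features, targets[0])
pipe.score(features, targets[0])  # in-sample score, a sanity check only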
In [122]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=4)
In [129]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
modelLR = LogisticRegression(n_jobs=-1)
modelLR.fit(X_train,y_train);
In [130]:
y_pred = modelLR.predict(X_test)
modelLR.score(X_test, y_test)  # score against the true labels; scoring against y_pred would always be 1.0
Out[130]:
In [131]:
confusion_matrix=metrics.confusion_matrix(y_test,y_pred)
confusion_matrix
Out[131]:
In [147]:
import matplotlib.pyplot as plt
%matplotlib inline
LABEL_COLOR_MAP = {0: 'g',
                   1: 'y',
                   2: 'r'}
pred_color = [LABEL_COLOR_MAP[i] for i in y_pred]
test_color = [LABEL_COLOR_MAP[i] for i in y_test]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(5, 2))
axes[0].scatter(X_test[:, 0], X_test[:, 1], c=pred_color)
axes[0].set_title('Predicted')
axes[1].scatter(X_test[:, 0], X_test[:, 1], c=test_color)
axes[1].set_title('True');
In [137]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
LRs = LogisticRegression(solver='liblinear')  # liblinear supports both l1 and l2 penalties
tuned_parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                    'penalty': ['l1', 'l2']}
modelLR = GridSearchCV(LRs, tuned_parameters, cv=10)
Grid search tries every combination of the parameter values, scores each with 10-fold cross-validation on the training set, and keeps the best one.
In [140]:
modelLR.fit(X_train,y_train)
Out[140]:
In [141]:
print(modelLR.best_params_)
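Beyond the single best combination, the full scoring table for everything the search tried is available in cv_results_; a minimal sketch of inspecting it as a DataFrame (the column names are standard GridSearchCV output):
In [ ]:
import pandas as pd
# One row per parameter combination, with its mean cross-validated score
results = pd.DataFrame(modelLR.cv_results_)
print(results[['params', 'mean_test_score', 'rank_test_score']])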
In [142]:
y_pred = modelLR.predict(X_test)
modelLR.score(X_test, y_test)  # again, compare with the true labels, not the predictions
Out[142]:
In [143]:
confusion_matrix=metrics.confusion_matrix(y_test,y_pred)
confusion_matrix
Out[143]:
In [144]:
report = metrics.classification_report(y_test, y_pred)  # per-class precision/recall/F1, not ROC AUC
report
Out[144]:
In [148]:
import matplotlib.pyplot as plt
%matplotlib inline
LABEL_COLOR_MAP = {0: 'g',
                   1: 'y',
                   2: 'r'}
pred_color = [LABEL_COLOR_MAP[i] for i in y_pred]
test_color = [LABEL_COLOR_MAP[i] for i in y_test]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(5, 2))
axes[0].scatter(X_test[:, 0], X_test[:, 1], c=pred_color)
axes[0].set_title('Predicted')
axes[1].scatter(X_test[:, 0], X_test[:, 1], c=test_color)
axes[1].set_title('True');
In [149]:
from sklearn.svm import SVC
svm = SVC()
# A list of grids, one per kernel: with duplicate keys in a single dict,
# Python would silently keep only the rbf entries
tuned_parameters = [
    {'C': [1, 10, 100, 500, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 500, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
    #{'C': [1, 10, 100, 500, 1000], 'degree': [2, 3, 4, 5, 6], 'kernel': ['poly']}
]
In [165]:
from sklearn.model_selection import RandomizedSearchCV
# Randomly sample 20 parameter settings from the grids instead of trying all of them
modelsvm = RandomizedSearchCV(svm, tuned_parameters, cv=10, scoring='accuracy', n_iter=20)
In [166]:
modelsvm.fit(X_train, y_train)
print(modelsvm.best_score_)
In [170]:
modelsvm.cv_results_
Out[170]:
In [155]:
print(modelsvm.best_params_)
In [157]:
y_pred= modelsvm.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))  # y_true comes first
In [158]:
confusion_matrix=metrics.confusion_matrix(y_test,y_pred)
confusion_matrix
Out[158]:
In [159]:
report = metrics.classification_report(y_test, y_pred)  # per-class precision/recall/F1, not ROC AUC
report
Out[159]:
In [160]:
import matplotlib.pyplot as plt
%matplotlib inline
LABEL_COLOR_MAP = {0: 'g',
                   1: 'y',
                   2: 'r'}
pred_color = [LABEL_COLOR_MAP[i] for i in y_pred]
test_color = [LABEL_COLOR_MAP[i] for i in y_test]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(5, 2))
axes[0].scatter(X_test[:, 0], X_test[:, 1], c=pred_color)
axes[0].set_title('Predicted')
axes[1].scatter(X_test[:, 0], X_test[:, 1], c=test_color)
axes[1].set_title('True');