In [1]:
# This notebook contains the code of the blog post http://ataspinar.com/2017/05/26/classification-with-scikit-learn/
# Although I'll also give a short description in this notebook, you should read the blog for a full explanation.
# Let's import some modules for basic computation
import time
import pandas as pd
import numpy as np
import pickle
# Some modules for plotting and visualizing
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
# And some Machine Learning modules from scikit-learn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# These classifiers have been commented out because they take too long to train and do not give better accuracy than the other ones.
#from sklearn.ensemble import AdaBoostClassifier
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#from sklearn.gaussian_process import GaussianProcessClassifier
In [2]:
dict_classifiers = {
"Logistic Regression": LogisticRegression(),
"Nearest Neighbors": KNeighborsClassifier(),
"Linear SVM": SVC(),
"Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=1000),
"Decision Tree": tree.DecisionTreeClassifier(),
"Random Forest": RandomForestClassifier(n_estimators=1000),
"Neural Net": MLPClassifier(alpha = 1),
"Naive Bayes": GaussianNB(),
#"AdaBoost": AdaBoostClassifier(),
#"QDA": QuadraticDiscriminantAnalysis(),
#"Gaussian Process": GaussianProcessClassifier()
}
def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 5, verbose = True):
    """
    This method takes as input the X, Y matrices of the train and test set,
    and fits them on all of the classifiers specified in dict_classifiers.
    The trained models and their accuracies are saved in a dictionary. The reason to use a dictionary
    is that it is very easy to save the whole dictionary with the pickle module.

    Usually the SVM, Random Forest and Gradient Boosting classifiers take quite some time to train.
    So it is best to train them on a smaller dataset first and
    decide whether you want to comment them out or not based on the test accuracy score.
    """
    dict_models = {}
    for classifier_name, classifier in list(dict_classifiers.items())[:no_classifiers]:
        t_start = time.perf_counter()  # time.clock() was removed in Python 3.8
        classifier.fit(X_train, Y_train)
        t_end = time.perf_counter()
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)
        dict_models[classifier_name] = {'model': classifier, 'train_score': train_score, 'test_score': test_score, 'train_time': t_diff}
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=classifier_name, f=t_diff))
    return dict_models
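# Illustrative example (not in the original notebook): because batch_classify()
# returns a plain dictionary, the trained models can be saved and restored with
# pickle, e.g.:
#   with open('dict_models.pkl', 'wb') as f:   # hypothetical filename
#       pickle.dump(dict_models, f)
#   with open('dict_models.pkl', 'rb') as f:
#       dict_models = pickle.load(f)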
def label_encode(df, list_columns):
    """
    This method label encodes all columns specified in list_columns:
    every unique value in a column is replaced by an integer.
    """
    for col in list_columns:
        le = LabelEncoder()
        col_values_unique = list(df[col].unique())
        le.fit(col_values_unique)
        col_values = list(df[col].values)
        col_values_transformed = le.transform(col_values)
        df[col] = col_values_transformed
def expand_columns(df, list_columns):
    """
    This method one-hot encodes all columns specified in list_columns:
    each column is expanded into one binary column per unique value.
    """
    for col in list_columns:
        colvalues = df[col].unique()
        for colvalue in colvalues:
            newcol_name = "{}_is_{}".format(col, colvalue)
            df.loc[df[col] == colvalue, newcol_name] = 1
            df.loc[df[col] != colvalue, newcol_name] = 0
    df.drop(list_columns, inplace=True, axis=1)
def get_train_test(df, y_col, x_cols, ratio):
    """
    This method splits a dataframe into a train and a test set. For this you need to specify:
    1. the ratio train : test (usually 0.7)
    2. the column with the Y values (y_col) and the columns with the X values (x_cols)
    """
    mask = np.random.rand(len(df)) < ratio
    df_train = df[mask]
    df_test = df[~mask]
    Y_train = df_train[y_col].values
    Y_test = df_test[y_col].values
    X_train = df_train[x_cols].values
    X_test = df_test[x_cols].values
    return df_train, df_test, X_train, Y_train, X_test, Y_test
def display_dict_models(dict_models, sort_by='test_score'):
    cls = [key for key in dict_models.keys()]
    test_s = [dict_models[key]['test_score'] for key in cls]
    training_s = [dict_models[key]['train_score'] for key in cls]
    training_t = [dict_models[key]['train_time'] for key in cls]
    df_ = pd.DataFrame(data=np.zeros(shape=(len(cls), 4)), columns = ['classifier', 'train_score', 'test_score', 'train_time'])
    for ii in range(0, len(cls)):
        df_.loc[ii, 'classifier'] = cls[ii]
        df_.loc[ii, 'train_score'] = training_s[ii]
        df_.loc[ii, 'test_score'] = test_s[ii]
        df_.loc[ii, 'train_time'] = training_t[ii]
    display(df_.sort_values(by=sort_by, ascending=False))
def display_corr_with_col(df, col):
    correlation_matrix = df.corr()
    correlation_type = correlation_matrix[col].copy()
    abs_correlation_type = correlation_type.apply(lambda x: abs(x))
    desc_corr_values = abs_correlation_type.sort_values(ascending=False)
    y_values = list(desc_corr_values.values)[1:]
    x_values = range(0, len(y_values))
    xlabels = list(desc_corr_values.keys())[1:]
    fig, ax = plt.subplots(figsize=(8,8))
    ax.bar(x_values, y_values)
    ax.set_title('The correlation of all features with {}'.format(col), fontsize=20)
    ax.set_ylabel('Pearson correlation coefficient [absolute value]', fontsize=16)
    plt.xticks(x_values, xlabels, rotation='vertical')
    plt.show()
In [3]:
filename_glass = '../datasets/glass.csv'
In [4]:
df_glass = pd.read_csv(filename_glass)
print("This dataset has nrows, ncols: {}".format(df_glass.shape))
display(df_glass.head())
display(df_glass.describe())
In [5]:
y_col_glass = 'Type'
x_cols_glass = list(df_glass.columns.values)
x_cols_glass.remove(y_col_glass)
train_test_ratio = 0.7
df_train, df_test, X_train, Y_train, X_test, Y_test = get_train_test(df_glass, y_col_glass, x_cols_glass, train_test_ratio)
dict_models = batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 8)
display_dict_models(dict_models)
In [6]:
filename_mushrooms = '../datasets/mushrooms.csv'
df_mushrooms = pd.read_csv(filename_mushrooms)
display(df_mushrooms.head())
In [7]:
for col in df_mushrooms.columns.values:
    print(col, df_mushrooms[col].unique())
In [8]:
for col in df_mushrooms.columns.values:
    if len(df_mushrooms[col].unique()) <= 1:
        print("Removing column {}, which only contains the value: {}".format(col, df_mushrooms[col].unique()[0]))
        df_mushrooms.drop(col, axis=1, inplace=True)
Some datasets contain missing values in the form of NaN, null, NULL, '?', '??', etc.
It could be that all missing values are of type NaN, or that some columns contain NaN while other columns contain missing data in the form of '??'.
It is up to your best judgement to decide what to do with these missing values. What is most effective really depends on the type of data, the type of missing data, and the ratio between missing and non-missing data. For example:
- If the rows containing missing data make up only a few percent of the total dataset, the best option could be to drop those rows.
- If a column contains almost nothing but missing data, it will not have much added value and it might be best to drop that column.
- It could also be that a value not being filled in is itself information that helps with the classification, in which case it is best to leave it as it is.
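In pandas these strategies amount to a few one-liners. A minimal sketch, assuming a hypothetical dataframe df in which the missing entries have already been replaced by np.nan (the cells below apply the same ideas to the mushroom dataset):

df_dropped_rows = df.dropna(axis=0)   # drop every row that contains a missing value
df_dropped_cols = df.dropna(axis=1)   # drop every column that contains a missing value
df_filled = df.fillna(0)              # fill missing values with a constant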
In [9]:
print("Number of rows in total: {}".format(df_mushrooms.shape[0]))
print("Number of rows with missing values in column 'stalk-root': {}".format(df_mushrooms[df_mushrooms['stalk-root'] == '?'].shape[0]))
df_mushrooms_dropped_rows = df_mushrooms[df_mushrooms['stalk-root'] != '?']
In [10]:
drop_percentage = 0.8
df_mushrooms_dropped_cols = df_mushrooms.copy(deep=True)
df_mushrooms_dropped_cols.loc[df_mushrooms_dropped_cols['stalk-root'] == '?', 'stalk-root'] = np.nan
for col in df_mushrooms_dropped_cols.columns.values:
    no_rows = df_mushrooms_dropped_cols[col].isnull().sum()
    percentage = no_rows / df_mushrooms_dropped_cols.shape[0]
    if percentage > drop_percentage:
        del df_mushrooms_dropped_cols[col]
        print("Column {} contains {} missing values. This is {:.2f} percent. Dropping this column.".format(col, no_rows, percentage*100))
In [12]:
df_mushrooms_zerofill = df_mushrooms.copy(deep = True)
df_mushrooms_zerofill.loc[df_mushrooms_zerofill['stalk-root'] == '?', 'stalk-root'] = np.nan
df_mushrooms_zerofill.fillna(0, inplace=True)
In [11]:
df_mushrooms_bfill = df_mushrooms.copy(deep = True)
df_mushrooms_bfill.loc[df_mushrooms_bfill['stalk-root'] == '?', 'stalk-root'] = np.nan
df_mushrooms_bfill.bfill(inplace=True)
In [13]:
df_mushrooms_ffill = df_mushrooms.copy(deep = True)
df_mushrooms_ffill.loc[df_mushrooms_ffill['stalk-root'] == '?', 'stalk-root'] = np.nan
df_mushrooms_ffill.ffill(inplace=True)
When it comes to columns with categorical data, you can do two things:
1. Label encode the column: every unique value is replaced by an integer (the label_encode() method above).
2. Expand the column into one binary column per unique value, also known as one-hot encoding (the expand_columns() method above).
Example:
Let's assume that we have a column called 'FRUIT' with the unique values ['ORANGE', 'APPLE', 'PEAR']. Label encoding maps this single column to the integers [0, 1, 2], while one-hot encoding expands it into the three binary columns 'FRUIT_is_ORANGE', 'FRUIT_is_APPLE' and 'FRUIT_is_PEAR'.
When using the first method, you should pay attention to the fact that some classifiers will try to make sense of the numerical value of the label-encoded column. For example, the nearest neighbours algorithm assumes that the value 1 is closer to 0 than the value 2. But these numerical values have no meaning for a label-encoded categorical column (an APPLE is not closer to an ORANGE than a PEAR is), and the results can therefore be misleading.
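As an aside, pandas also ships a built-in helper for the second approach. A minimal sketch on the hypothetical 'FRUIT' column from the example above (not part of the original notebook):

df_fruit = pd.DataFrame({'FRUIT': ['ORANGE', 'APPLE', 'PEAR']})
# get_dummies() expands the column into one binary column per unique value,
# similar to the expand_columns() helper defined above.
display(pd.get_dummies(df_fruit, columns=['FRUIT'], prefix_sep='_is_'))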
In [14]:
df_mushrooms_ohe = df_mushrooms.copy(deep=True)
to_be_encoded_cols = df_mushrooms_ohe.columns.values
label_encode(df_mushrooms_ohe, to_be_encoded_cols)
display(df_mushrooms_ohe.head())
In [15]:
## Now let's do the same thing for the other dataframes
df_mushrooms_dropped_rows_ohe = df_mushrooms_dropped_rows.copy(deep = True)
df_mushrooms_zerofill_ohe = df_mushrooms_zerofill.copy(deep = True)
df_mushrooms_bfill_ohe = df_mushrooms_bfill.copy(deep = True)
df_mushrooms_ffill_ohe = df_mushrooms_ffill.copy(deep = True)
label_encode(df_mushrooms_dropped_rows_ohe, to_be_encoded_cols)
label_encode(df_mushrooms_zerofill_ohe, to_be_encoded_cols)
label_encode(df_mushrooms_bfill_ohe, to_be_encoded_cols)
label_encode(df_mushrooms_ffill_ohe, to_be_encoded_cols)
In [16]:
y_col = 'class'
to_be_expanded_cols = list(df_mushrooms.columns.values)
to_be_expanded_cols.remove(y_col)
df_mushrooms_expanded = df_mushrooms.copy(deep=True)
label_encode(df_mushrooms_expanded, [y_col])
expand_columns(df_mushrooms_expanded, to_be_expanded_cols)
display(df_mushrooms_expanded.head())
In [17]:
## Now let's do the same thing for all other dataframes
df_mushrooms_dropped_rows_expanded = df_mushrooms_dropped_rows.copy(deep = True)
df_mushrooms_zerofill_expanded = df_mushrooms_zerofill.copy(deep = True)
df_mushrooms_bfill_expanded = df_mushrooms_bfill.copy(deep = True)
df_mushrooms_ffill_expanded = df_mushrooms_ffill.copy(deep = True)
label_encode(df_mushrooms_dropped_rows_expanded, [y_col])
label_encode(df_mushrooms_zerofill_expanded, [y_col])
label_encode(df_mushrooms_bfill_expanded, [y_col])
label_encode(df_mushrooms_ffill_expanded, [y_col])
expand_columns(df_mushrooms_dropped_rows_expanded, to_be_expanded_cols)
expand_columns(df_mushrooms_zerofill_expanded, to_be_expanded_cols)
expand_columns(df_mushrooms_bfill_expanded, to_be_expanded_cols)
expand_columns(df_mushrooms_ffill_expanded, to_be_expanded_cols)
We have seen that there are two different ways to handle columns with categorical data, and many different ways to handle missing values.
Since computational power is cheap, it is easy to try out all of these combinations on all of the classifiers available in scikit-learn.
Once we have seen which combination of method and classifier gives the highest accuracy, we can continue in that direction.
In [18]:
dict_dataframes = {
"df_mushrooms_ohe": df_mushrooms_ohe,
"df_mushrooms_dropped_rows_ohe": df_mushrooms_dropped_rows_ohe,
"df_mushrooms_zerofill_ohe": df_mushrooms_zerofill_ohe,
"df_mushrooms_bfill_ohe": df_mushrooms_bfill_ohe,
"df_mushrooms_ffill_ohe": df_mushrooms_ffill_ohe,
"df_mushrooms_expanded": df_mushrooms_expanded,
"df_mushrooms_dropped_rows_expanded": df_mushrooms_dropped_rows_expanded,
"df_mushrooms_zerofill_expanded": df_mushrooms_zerofill_expanded,
"df_mushrooms_bfill_expanded": df_mushrooms_bfill_expanded,
"df_mushrooms_ffill_expanded": df_mushrooms_ffill_expanded
}
In [19]:
y_col = 'class'
train_test_ratio = 0.7
for df_key, df in dict_dataframes.items():
    x_cols = list(df.columns.values)
    x_cols.remove(y_col)
    df_train, df_test, X_train, Y_train, X_test, Y_test = get_train_test(df, y_col, x_cols, train_test_ratio)
    dict_models = batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 8, verbose=False)
    print()
    print(df_key)
    display_dict_models(dict_models)
    print("-------------------------------------------------------")
After you have determined, with a quick and dirty method, which way of handling missing values and which classifier work best for your dataset, you can improve upon the classifier by optimizing its hyperparameters.
Since the mushroom dataset already gets a high accuracy on the test set, there is not much to improve upon there. So, to demonstrate hyperparameter optimization, we'll use the glass dataset again.
In [20]:
GDB_params = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.5, 0.1, 0.01, 0.001],
    'criterion': ['friedman_mse', 'mse', 'mae']  # note: recent scikit-learn versions no longer accept 'mse' / 'mae' here ('squared_error' replaced 'mse')
}
df_train, df_test, X_train, Y_train, X_test, Y_test = get_train_test(df_glass, y_col_glass, x_cols_glass, 0.6)
for n_est in GDB_params['n_estimators']:
    for lr in GDB_params['learning_rate']:
        for crit in GDB_params['criterion']:
            clf = GradientBoostingClassifier(n_estimators=n_est,
                                             learning_rate=lr,
                                             criterion=crit)
            clf.fit(X_train, Y_train)
            train_score = clf.score(X_train, Y_train)
            test_score = clf.score(X_test, Y_test)
            print("For ({}, {}, {}) - train, test score: \t {:.5f} \t-\t {:.5f}".format(n_est, lr, crit[:4], train_score, test_score))
Some datasets contain a lot of features / columns, and it is not immediately clear which of these features help with the classification / regression, and which of them only add more noise.
To get a better understanding of this, you can compute the correlation matrix of the data and plot all features in descending order of their (absolute) correlation with the target column.
In [21]:
correlation_matrix = df_glass.corr()
plt.figure(figsize=(10,8))
ax = sns.heatmap(correlation_matrix, vmax=1, square=True, annot=True,fmt='.2f', cmap ='GnBu', cbar_kws={"shrink": .5}, robust=True)
plt.title('Correlation matrix between the features', fontsize=20)
plt.show()
In [22]:
display_corr_with_col(df_glass, 'Type')
The cumulative explained variance shows how much of the variance is captured by the first x principal components.
Below we can see that the first four components already capture 90% of the variance.
If you have low accuracy values for your regression / classification model, you could decide to stepwise remove the features with the lowest correlation (or stepwise add features with the highest correlation); a short sketch of this follows after the PCA plot below.
In [23]:
X = df_glass[x_cols_glass].values
X_std = StandardScaler().fit_transform(X)
pca = PCA().fit(X_std)
var_ratio = pca.explained_variance_ratio_
components = pca.components_
#print(pca.explained_variance_)
plt.plot(np.cumsum(var_ratio))
plt.xlim(0, 9)
plt.xlabel('Number of components', fontsize=16)
plt.ylabel('Cumulative explained variance', fontsize=16)
plt.show()
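Below is a minimal sketch of the stepwise idea mentioned above (illustrative, not part of the original post): rank the features by their absolute correlation with 'Type' and train a classifier on the top k features for increasing k:

# Rank the features by the absolute value of their correlation with the target column.
corr_with_type = df_glass.corr()['Type'].drop('Type').abs().sort_values(ascending=False)
ranked_features = list(corr_with_type.index)
for k in range(1, len(ranked_features) + 1):
    top_k_cols = ranked_features[:k]
    _, _, X_tr, Y_tr, X_te, Y_te = get_train_test(df_glass, 'Type', top_k_cols, 0.7)
    clf = GradientBoostingClassifier(n_estimators=100)
    clf.fit(X_tr, Y_tr)
    print("top {} features: test score {:.3f}".format(k, clf.score(X_te, Y_te)))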
In [24]:
ax = sns.pairplot(df_glass, hue='Type')
ax.fig.suptitle('Pairwise relationships between the features', y=1.02)
plt.show()
In [ ]: