In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the iris dataset bundled with scikit-learn. The returned object is used
# below through its .data, .target, .feature_names and .target_names attributes.
from sklearn import datasets
iris = datasets.load_iris()
In [3]:
# Show the names of the target classes
iris.target_names
Out[3]:
In [4]:
# Show the names of the feature columns
iris.feature_names
Out[4]:
In [5]:
# Peek at four rows of the feature matrix (positions 50 to 53)
iris.data[50:54]
Out[5]:
In [6]:
# Show the label array (used as `y` alongside iris.data further down)
iris.target
Out[6]:
In [7]:
# Wrap the raw iris feature matrix in a DataFrame, giving each of the four
# columns a short name that is convenient to type.
df = pd.DataFrame(
    data=iris.data,
    columns=['sepal_l', 'sepal_w', 'petal_l', 'petal_w'],
)
df.head()
Out[7]:
In [8]:
# Count the non-null entries in each column: a count equal to the number of
# rows means the column contains no NaN. df.info() is a good alternative.
df.notna().sum()
Out[8]:
In [9]:
# Display summary statistics (mean, std, min, max, ...) of the DataFrame
df.describe()
# Individual statistics are also available through df.mean(), df.std(), df.min(), df.max()
Out[9]:
In [10]:
# Prediction: assume that suitable coefficients a0, a1, a2 have already been found by fitting.
# predict(x1, x2) = a0 + a1*x1 + a2*x2 = (a1*x1 + a0_1) + (a2*x2 + a0_2), where a0_1 + a0_2 = a0
In [11]:
# Append the target labels to the feature DataFrame as a new 'target' column.
# Note: `columns` must be list-like; a bare string such as 'target' makes the
# DataFrame constructor raise a TypeError, so the name is wrapped in a list.
df2 = pd.concat([df, pd.DataFrame(iris.target, columns=['target'])], axis=1)
# Equivalent alternative: an index-aligned merge gives the same result
#df2 = df.merge(pd.DataFrame(iris.target, columns=['target']), left_index=True, right_index=True)
df2.head()
Out[11]:
In [12]:
# Build one DataFrame per class using boolean-mask row selection.
# Step 1: retrieve the target column as a Series
#         (attribute access; df2['target'] is the equivalent item syntax).
df2.target
# Step 2: compare the Series with a label, which yields a Series of booleans.
df2['target'] == 0
# Step 3: index the DataFrame with that boolean Series to keep matching rows,
#         once for each of the three labels.
iris0, iris1, iris2 = (df2[df2.target == label] for label in (0, 1, 2))
In [13]:
# Summary statistics of the rows whose target is 0
iris0.describe()
Out[13]:
In [14]:
# Summary statistics of the rows whose target is 1
iris1.describe()
Out[14]:
In [15]:
# Summary statistics of the rows whose target is 2
iris2.describe()
Out[15]:
In [41]:
# With groupby, the per-class comparison above can be shown in a single statement
df2.groupby('target').describe().T  # .T transposes the table (swaps rows and columns)
Out[41]:
In [16]:
# Manual train/test split: shuffle the 150 row positions, then use the first
# 50 shuffled positions for testing and the remaining 100 for training.
rnd_index = np.random.permutation(150)
test_idx, train_idx = rnd_index[:50], rnd_index[50:]

# Select the feature rows by position with iloc
X_test, X_train = df.iloc[test_idx], df.iloc[train_idx]

# Put the labels in their own single-column DataFrame ...
df_target = pd.DataFrame(data=iris.target, columns=['target'])
# ... so the very same positions select the matching labels (expected results)
y_test, y_train = df_target.iloc[test_idx], df_target.iloc[train_idx]
y_test.head()
Out[16]:
In [17]:
# First rows of the test features; their index values should match y_test.head() above
X_test.head()
Out[17]:
In [18]:
# Split the data with scikit-learn's train_test_split
from sklearn.model_selection import train_test_split
# test_size=0.4 holds out 40% of the samples for testing (train:test = 60:40).
# NOTE: the original note here suggested train:test ratios of 70:30 to 80:20.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4)
In [19]:
# It's possible to use pandas DF as input data
#ser_target = pd.Series(iris.target)
#from sklearn.utils.validation import column_or_1d
#X_train, X_test, y_train, y_test = train_test_split(df, ser_target, test_size=0.2)
# or use df.values attribute which is numpy array of the data of DataFrame
#X_train, X_test, y_train, y_test = train_test_split(df.values, ser_target.values, test_size=0.25)
In [20]:
# Print the lengths of the feature and label arrays for the training and testing splits
print('Size of training:', len(X_train), len(y_train), ' Size of testing:', len(X_test), len(y_test))
In [21]:
# Fit a classifier on the training split and predict the test split.
# Logistic regression is the active choice; each commented-out `clf = ...` line
# below is an alternative estimator that can be swapped in the same way.
from sklearn.linear_model import LogisticRegression # other algorithms can be chosen just as easily
clf = LogisticRegression() # constructor arguments can override the default parameters
from sklearn.linear_model import LogisticRegressionCV # variant configured with a list of Cs values below
#clf = LogisticRegressionCV(Cs=[0.1, 1, 10, 100,1000], solver='newton-cg', max_iter=10000) # example of overriding parameters
from sklearn.tree import DecisionTreeClassifier
#clf = DecisionTreeClassifier()
from sklearn.neighbors import KNeighborsClassifier
#clf = KNeighborsClassifier()
from sklearn.neural_network import MLPClassifier
#clf = MLPClassifier()
from sklearn.svm import SVC
#clf = SVC()
clf.fit(X_train, y_train) # fit the model: adjusts its parameters to the training data
#svc.fit(X_train, y_train) # leftover from a second-model experiment (no `svc` is defined in this notebook)
# Predict the labels of the test split so they can be compared with y_test
p_test = clf.predict(X_test)
#p_svc = clf.predict(X_test)
print('Predicted', p_test)
print('Expected ', y_test)
#print('Pred SVC', p_svc)
clf # displaying the estimator shows which parameters can be specified
Out[21]:
In [22]:
# Inspect the coefficients of the fitted classifier
clf.coef_
Out[22]:
https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
In [23]:
# Calculate accuracy of predicted testing data. (accuracy: number_of_correct_answer/number_of_testing_data)
match_count = 0
for i in range(len(p_test)): #loop from 0 to 30-1 (total 30 loop)
if p_test[i] == y_test[i]:
match_count += 1
print(match_count / len(p_test))
In [24]:
# Same accuracy in one expression: the mean of the element-wise match array
(y_test == p_test).mean()
Out[24]:
In [25]:
# Same accuracy again, this time using scikit-learn's helper
from sklearn.metrics import accuracy_score
accuracy_score(y_test, p_test)
Out[25]:
In [26]:
# Accuracy on the training split, for comparison with the test accuracy above
p_train = clf.predict(X_train)
np.mean(y_train == p_train)
Out[26]:
https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html https://scikit-learn.org/stable/modules/cross_validation.html
See also the Coursera Machine Learning course, lecture 10.
In [27]:
# Hand-made learning curve.
# Split the training data once more: test_size=0.25 keeps 75% of it for
# fitting (X_tr, y_tr) and 25% for cross-validation (X_cv, y_cv).
X_tr, X_cv, y_tr, y_cv = train_test_split(X_train, y_train, test_size=0.25)

# Training-set sizes grow in steps of one tenth of the fitting data (integer division)
step = len(y_tr) // 10
train_size = list(range(step, len(y_tr) + 1, step))

train_score, cv_score = [], []
for t_size in train_size:
    # Fit on the first t_size samples only
    X_sub, y_sub = X_tr[:t_size], y_tr[:t_size]
    clf.fit(X_sub, y_sub)
    # Score on the samples used for fitting and on the held-out CV samples
    p_tr = clf.predict(X_sub)
    p_cv = clf.predict(X_cv)
    train_score.append(round((y_sub == p_tr).mean(), 3))
    cv_score.append(round((y_cv == p_cv).mean(), 3))
In [28]:
# Modified version of the learning-curve loop: each training size collects its
# scores in a list and records their mean, so the fit can be repeated.
# NOTE: the repeat count below is currently 1 (the original note said "10 times").
# Split the training data into fitting (75%) and cross-validation (25%) parts.
X_tr, X_cv, y_tr, y_cv = train_test_split(X_train, y_train, test_size=0.25)
train_size = list(range(len(y_tr) // 10, len(y_tr) + 1, len(y_tr) // 10))
train_score = list()
cv_score = list()
for t_size in train_size:
    #print('fitting with training size:', t_size)
    tr_result = list()
    cv_result = list()
    for i in range(1):
        # Fit on the first t_size samples ...
        clf.fit(X_tr[:t_size], y_tr[:t_size])
        # ... but, unlike the previous cell, score on the whole fitting split
        p_tr = clf.predict(X_tr)
        p_cv = clf.predict(X_cv)
        tr_result.append((y_tr == p_tr).mean())
        cv_result.append((y_cv == p_cv).mean())
    train_score.append(round(np.mean(tr_result), 3))
    # Fix: average the CV results here; the training results were averaged
    # a second time, which made the CV curve a copy of the training curve.
    cv_score.append(round(np.mean(cv_result), 3))
    #print('Training score:', round((y_tr == p_tr).mean(),3), end=' ')
    #print('CV score:', round((y_cv == p_cv).mean(),3))
In [29]:
# Show the training-set sizes used for the learning curve
train_size
Out[29]:
In [45]:
# Draw the hand-made learning curve: accuracy as a function of the number of
# training samples, one line for the training score and one for the CV score.
fig, ax = plt.subplots()
ax.set_title('Learning curve')
ax.set_xlabel('Number of training data')
ax.set_ylabel('Accuracy')
ax.plot(train_size, train_score, 'k--', label='training score')
ax.plot(train_size, cv_score, 'r-', label='CV score')
ax.legend(loc='best')
plt.show()
In [31]:
# Learning curve using the sample of scikit-learn
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training and cross-validation scores against training-set size.

    The scores are obtained from ``learning_curve``; for each training size
    the mean over the splits is drawn as a line and a band of one standard
    deviation on either side of the mean is shaded.

    Parameters
    ----------
    estimator : object
        Estimator forwarded unchanged to ``learning_curve``.
    title : string
        Title for the chart.
    X : array-like
        Feature data forwarded to ``learning_curve``.
    y : array-like
        Target data forwarded to ``learning_curve``.
    ylim : tuple (ymin, ymax), optional
        When given, sets the limits of the y axis.
    cv : optional
        Cross-validation strategy, forwarded to ``learning_curve``.
    n_jobs : int or None, optional (default=None)
        Number of parallel jobs, forwarded to ``learning_curve``.
    train_sizes : array-like
        Training-set sizes, forwarded to ``learning_curve``.
        (default: np.linspace(0.1, 1.0, 5))

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    # Start a fresh figure and label it
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (scores, colour, legend label) for each of the two curves
    curves = (
        (train_scores, "r", "Training score"),
        (test_scores, "g", "Cross-validation score"),
    )
    # Reduce over axis 1 so that there is one mean/std per training size
    stats = [(np.mean(scores, axis=1), np.std(scores, axis=1), color, label)
             for scores, color, label in curves]

    plt.grid()
    # Shaded bands first, lines afterwards (same drawing order as before)
    for mean, std, color, _ in stats:
        plt.fill_between(train_sizes, mean - std, mean + std,
                         alpha=0.1, color=color)
    for mean, _, color, label in stats:
        plt.plot(train_sizes, mean, 'o-', color=color, label=label)

    plt.legend(loc="best")
    return plt
title = "Learning Curves (Logistic regression)"
# Alternative configuration with fewer splits and a fixed random_state:
# cv = ShuffleSplit(n_splits=2, test_size=0.25, random_state=0)
# Five shuffle splits, each with test_size=0.25
cv = ShuffleSplit(n_splits=5, test_size=0.25)
# Plot the curve for the current classifier `clf` on the training split
plot_learning_curve(clf, title, X_train, y_train, cv=cv, n_jobs=4)
plt.show()
In [49]:
# Derive extra feature columns from the four original measurements.
# Squares
df['petal_w2'] = df['petal_w'].mul(df['petal_w'])
# Length / width ratios
df['sepal_r'] = df['sepal_l'].div(df['sepal_w'])
df['petal_r'] = df['petal_l'].div(df['petal_w'])
df['petal_r2'] = df['petal_r'].mul(df['petal_r'])
# Length * width "sizes" and the ratio between them
df['sepal_size'] = df['sepal_l'].mul(df['sepal_w'])
df['petal_size'] = df['petal_l'].mul(df['petal_w'])
df['size_ratio'] = df['sepal_size'].div(df['petal_size'])
# A constant multiple of an original column: adds no new information
df['sepal2'] = df['sepal_l'].mul(2)
#df['ratio_ratio'] = df['sepal_r'] / df['petal_r']

# Print a few rows from three different regions of the table
print(df.iloc[40:45], '\n\n')
print(df.iloc[70:75], '\n\n')
print(df.iloc[115:120])
In [33]:
# Summary statistics of rows 0-49
# (presumably the target == 0 samples; confirm against iris.target)
df[:50].describe()
Out[33]:
In [34]:
# Summary statistics of rows 50-99
# (presumably the target == 1 samples; confirm against iris.target)
df[50:100].describe()
Out[34]:
In [35]:
# Summary statistics of rows 100 onwards
# (presumably the target == 2 samples; confirm against iris.target)
df[100:].describe()
Out[35]:
In [36]:
# Choose the feature columns to train on.
# NOTE(review): the first assignment is immediately overwritten by the second,
# so only the second list takes effect; the first is kept as an alternative set.
features = ['petal_r', 'sepal_r', 'sepal_size', 'petal_size', 'size_ratio']
features = ['sepal_w', 'petal_l', 'petal_w', 'petal_w2', 'petal_r', 'sepal_r', 'size_ratio']
# Keep only the selected columns
dfx = df[features]
dfx.head()
Out[36]:
In [37]:
# Standardize dfx: the value ranges differ a lot between its columns.
# Each column has its mean subtracted and is then divided by its standard
# deviation. (sklearn.preprocessing.StandardScaler offers the same kind of operation.)
# Min-max scaling alternatives that were tried:
#dfn = (dfx - dfx.min() + 0.0000001) / (dfx.max() - dfx.min())
#dfn = (df - df.min() + 0.0000001) / (df.max() - df.min())
dfn = dfx.sub(dfx.mean()).div(dfx.std())
dfn.describe() # expect mean: 0 and std: 1 in every column
Out[37]:
In [38]:
def test_n_time(clf, n, test_ratio = 0.3, normalize=True):
    """Fit and score `clf` on `n` random train/test splits.

    Parameters
    ----------
    clf : estimator providing ``fit`` and ``predict``
        It is re-fitted in place on every split.
    n : int
        Number of random splits to evaluate.
    test_ratio : float
        Passed to ``train_test_split`` as ``test_size``.
    normalize : bool
        True uses the notebook-level frame ``dfn``; False uses the
        notebook-level frame ``df``.

    Returns
    -------
    list
        The test accuracy of each split, in order.
    """
    # Both candidate frames are notebook globals; labels come from iris.target
    dataframe = dfn if normalize else df

    cum_acc = []
    for _ in range(n):
        X_train, X_test, y_train, y_test = train_test_split(
            dataframe, iris.target, test_size=test_ratio)
        clf.fit(X_train, y_train)
        # Accuracy = fraction of test predictions that equal the true label
        p_test = clf.predict(X_test)
        cum_acc.append((y_test == p_test).mean())
    return cum_acc
In [39]:
# Choose the classifier to evaluate (alternatives are left commented out)
#clf = LogisticRegression()
#clf = KNeighborsClassifier()
clf = SVC()
# Number of random train/test splits to evaluate
test_count = 150
result = test_n_time(clf, test_count)
# Mean accuracy over all splits, followed by the spread of the individual accuracies
print('Average accuracy of', test_count, 'time:', sum(result) / test_count)
#print(result)
print('min:', min(result), 'max:' ,max(result), 'std:', np.std(result))
In [40]:
## Show the predicted class probabilities of each test record
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Fixed random_state so both models are compared on the same split on every run
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=1)
# Logistic regression: class probabilities and predicted labels
clf = LogisticRegression()
clf.fit(X_train, y_train)
prob_lr = clf.predict_proba(X_test)
p_test_lr = clf.predict(X_test)
# Random forest: class probabilities and predicted labels
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
prob_rf = clf.predict_proba(X_test)
p_test_rf = clf.predict(X_test)
# One line per test record: LR probabilities, RF probabilities, LR label,
# RF label, true label. Iterate over the actual number of test records
# (previously hard-coded to 30, which breaks if test_size is changed).
for i in range(len(y_test)):
    print(prob_lr[i], prob_rf[i], p_test_lr[i], p_test_rf[i], y_test[i])
In [ ]: