In [1]:
    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
    
In [2]:
    
from sklearn import datasets
iris = datasets.load_iris()
    
In [3]:
    
iris.target_names
    
    Out[3]:
In [4]:
    
iris.feature_names
    
    Out[4]:
In [5]:
    
iris.data[50:54]
    
    Out[5]:
In [6]:
    
iris.target
    
    Out[6]:
In [7]:
    
# Create a DataFrame from the iris feature data, with short column names
df = pd.DataFrame(iris.data, columns=['sepal_l', 'sepal_w', 'petal_l', 'petal_w'])
df.head()
    
    Out[7]:
In [8]:
    
# check whether there are any NaN values in the iris DataFrame
# df.info() is a good alternative
df.notnull().sum()  # counts non-null values per column; compare with len(df)
    
    Out[8]:
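The complementary call counts missing values directly; a quick sketch:

df.isnull().sum()  # every column should report 0 for this dataset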
In [9]:
    
# display summary statistics (mean, std, min, max, ...) of the DataFrame
df.describe()
# the mean(), std(), min(), max() methods return each value individually
    
    Out[9]:
In [10]:
    
# prediction: assume we have already found proper a0, a1, a2 by fitting
# predict(x1, x2) = a0 + a1*x1 + a2*x2 = (a1*x1 + a0_1) + (a2*x2 + a0_2), where a0_1 + a0_2 = a0
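# A minimal numeric sketch of this decomposition (the values below are made up for illustration):
a0, a1, a2 = 1.0, 2.0, 3.0    # hypothetical fitted parameters
a0_1, a0_2 = 0.25, 0.75       # any split with a0_1 + a0_2 == a0
x1, x2 = 4.0, 5.0
assert a0 + a1*x1 + a2*x2 == (a1*x1 + a0_1) + (a2*x2 + a0_2)  # both sides are 24.0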
    
In [11]:
    
# concatenate or merge the target data as a new column (same result)
df2 = pd.concat([df, pd.DataFrame(iris.target, columns=['target'])], axis=1)
#df2 = df.merge(pd.DataFrame(iris.target, columns=['target']),left_index=True, right_index=True)
df2.head()
    
    Out[11]:
In [12]:
    
# Select target:0 and create a new dataframe
# step-1: retrieve the target column as a Series
#df2['target']
# another syntax
df2.target
# step-2: perform a logical operation on the Series; the result is a Series of booleans
df2['target'] == 0
# step-3: use the boolean Series above to select records (rows)
iris0 = df2[df2.target==0]
iris1 = df2[df2.target==1]
iris2 = df2[df2.target==2]
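# An equivalent selection, as a sketch, using DataFrame.query:
#iris0 = df2.query('target == 0')  # same rows as df2[df2.target == 0]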
    
In [13]:
    
iris0.describe()
    
    Out[13]:
In [14]:
    
iris1.describe()
    
    Out[14]:
In [15]:
    
iris2.describe()
    
    Out[15]:
In [41]:
    
# with the groupby method, the comparison above can be shown in one statement
df2.groupby('target').describe().T  # .T transposes the output for easier reading
    
    Out[41]:
In [16]:
    
# split training and testing data using a random permutation
# 50 rows for testing, 100 for training
rnd_index = np.random.permutation(150)
test_idx = rnd_index[:50]
train_idx = rnd_index[50:]
# print(train_idx, test_idx)
# select data using iloc method
X_test = df.iloc[test_idx]
X_train = df.iloc[train_idx]
#X_test.describe()
#X_train.describe()
# define a DataFrame of LABELs
df_target = pd.DataFrame(iris.target, columns=['target'])
# we can use the same indices for y (LABEL), i.e. the expected results
y_test = df_target.iloc[test_idx]
y_train = df_target.iloc[train_idx]
y_test.head()
    
    Out[16]:
In [17]:
    
X_test.head()
    
    Out[17]:
In [18]:
    
# Split data with sklearn train_test_split
from sklearn.model_selection import train_test_split
# typical splits are train:test = 70:30 to 80:20; here we use 60:40
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4)
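# For reproducible, class-balanced splits, a sketch (assumption: a stratified split is wanted):
#X_train, X_test, y_train, y_test = train_test_split(
#    iris.data, iris.target, test_size=0.4, random_state=0, stratify=iris.target)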
    
In [19]:
    
# It's possible to use a pandas DataFrame as the input data
#ser_target = pd.Series(iris.target)
#from sklearn.utils.validation import column_or_1d
#X_train, X_test, y_train, y_test = train_test_split(df, ser_target, test_size=0.2)
# or use df.values attribute which is numpy array of the data of DataFrame
#X_train, X_test, y_train, y_test = train_test_split(df.values, ser_target.values, test_size=0.25)
    
In [20]:
    
print('Size of training:', len(X_train), len(y_train), '  Size of testing:', len(X_test), len(y_test))
    
    
In [21]:
    
# apply logistic regression
from sklearn.linear_model import LogisticRegression  # we can easily swap in another algorithm
clf = LogisticRegression()  # we can override parameters
from sklearn.linear_model import LogisticRegressionCV
#clf = LogisticRegressionCV(Cs=[0.1, 1, 10, 100, 1000], solver='newton-cg', max_iter=10000)  # we can override parameters
from sklearn.tree import DecisionTreeClassifier
#clf = DecisionTreeClassifier()
from sklearn.neighbors import KNeighborsClassifier
#clf = KNeighborsClassifier()
from sklearn.neural_network import MLPClassifier
#clf = MLPClassifier()
from sklearn.svm import SVC
#clf = SVC()
clf.fit(X_train, y_train)  # fit the model: adjust its parameters
#svc.fit(X_train, y_train)  # fit model: adjust parameter
# measure the accuracy on testing data
p_test = clf.predict(X_test)
#p_svc = clf.predict(X_test)
print('Predicted', p_test)
print('Expected ', y_test)
#print('Pred SVC', p_svc)
clf  # check which parameters can be specified
    
    
    
    Out[21]:
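To compare the candidate classifiers above more systematically, a minimal sketch using 5-fold cross-validation with sklearn's cross_val_score helper:

from sklearn.model_selection import cross_val_score
for candidate in [LogisticRegression(max_iter=1000), DecisionTreeClassifier(),
                  KNeighborsClassifier(), SVC()]:
    scores = cross_val_score(candidate, iris.data, iris.target, cv=5)
    print(type(candidate).__name__, round(scores.mean(), 3))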
In [22]:
    
clf.coef_
    
    Out[22]:
https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
In [23]:
    
# Calculate the accuracy on the testing data (accuracy = number_of_correct_answers / number_of_testing_data)
match_count = 0
for i in range(len(p_test)):  # loop over every test sample
    if p_test[i] == y_test[i]:
        match_count += 1
print(match_count / len(p_test))
    
    
In [24]:
    
(y_test == p_test).mean()
    
    Out[24]:
In [25]:
    
from sklearn.metrics import accuracy_score
accuracy_score(y_test, p_test)
    
    Out[25]:
In [26]:
    
# Calculate accuracy of predicted training data
p_train = clf.predict(X_train)
(y_train == p_train).mean()
    
    Out[26]:
https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
https://scikit-learn.org/stable/modules/cross_validation.html
See also Coursera Machine Learning, lecture 10.
In [27]:
    
# Separate the training data into Training + Cross-Validation sets
# Train : CV : Test = 60 : 20 : 20
X_tr, X_cv, y_tr, y_cv = train_test_split(X_train, y_train, test_size=0.25)
train_size = list(range(len(y_tr) // 10, len(y_tr) + 1, len(y_tr) // 10))
train_score = list()
cv_score = list()
for t_size in train_size:
    #print('fitting with training size:', t_size)
    clf.fit(X_tr[:t_size], y_tr[:t_size])
    p_tr = clf.predict(X_tr[:t_size])
    p_cv = clf.predict(X_cv)
    train_score.append(round((y_tr[:t_size] == p_tr).mean(), 3))
    cv_score.append(round((y_cv == p_cv).mean(), 3))
    #print('Training score:', round((y_tr == p_tr).mean(),3), end='  ')
    #print('CV score:', round((y_cv == p_cv).mean(),3))
    
    
In [28]:
    
# Modified version: average each score over 10 runs
# Separate the training data into Training + Cross-Validation sets
# Train : CV : Test = 60 : 20 : 20
X_tr, X_cv, y_tr, y_cv = train_test_split(X_train, y_train, test_size=0.25)
train_size = list(range(len(y_tr) // 10, len(y_tr) + 1, len(y_tr) // 10))
train_score = list()
cv_score = list()
for t_size in train_size:
    #print('fitting with training size:', t_size)
    tr_result = list()
    cv_result = list()
    for i in range(10):
        clf.fit(X_tr[:t_size], y_tr[:t_size])
        p_tr = clf.predict(X_tr)
        p_cv = clf.predict(X_cv)
        tr_result.append((y_tr == p_tr).mean())
        cv_result.append((y_cv == p_cv).mean())
    train_score.append(round(np.mean(tr_result), 3))
    cv_score.append(round(np.mean(cv_result), 3))
    #print('Training score:', round((y_tr == p_tr).mean(),3), end='  ')
    #print('CV score:', round((y_cv == p_cv).mean(),3))
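# Note: LogisticRegression is deterministic, so ten fits of the same split give ten
# identical scores and the average above equals a single run. A minimal variant (an
# assumption, not in the original) re-splits inside the loop so the runs actually differ:
cv_score_resplit = list()
for t_size in train_size:
    cv_result = list()
    for i in range(10):
        X_tr, X_cv, y_tr, y_cv = train_test_split(X_train, y_train, test_size=0.25)  # redraw the split each run
        clf.fit(X_tr[:t_size], y_tr[:t_size])
        cv_result.append((y_cv == clf.predict(X_cv)).mean())
    cv_score_resplit.append(round(np.mean(cv_result), 3))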
    
    
In [29]:
    
train_size
    
    Out[29]:
In [45]:
    
# plot learning curve
#plt.hold(True)
plt.figure()
plt.title('Learning curve')
plt.xlabel('Number of training data')
plt.ylabel('Accuracy')
plt.plot(train_size, train_score, 'k--', label='training score')
plt.plot(train_size, cv_score, 'r-', label='CV score')
plt.legend(loc='best')
plt.show()
    
    
In [31]:
    
# Learning curve using the sample of scikit-learn
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.
    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.
        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` is used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.
    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually has to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
title = "Learning Curves (Logistic regression)"
# cv = ShuffleSplit(n_splits=2, test_size=0.25, random_state=0)
cv = ShuffleSplit(n_splits=5, test_size=0.25)
plot_learning_curve(clf, title, X_train, y_train, cv=cv, n_jobs=4)
plt.show()
    
    
In [49]:
    
df['petal_w2'] = df['petal_w'] * df['petal_w']
df['sepal_r'] = df['sepal_l'] / df['sepal_w']
df['petal_r'] = df['petal_l'] / df['petal_w']
df['petal_r2'] = df['petal_r'] * df['petal_r']
df['sepal_size'] = df['sepal_l'] * df['sepal_w']
df['petal_size'] = df['petal_l'] * df['petal_w']
df['size_ratio'] = df['sepal_size'] / df['petal_size']
df['sepal2'] = df['sepal_l'] * 2  # it makes no sense to add constant * original data (linearly dependent)
#df['ratio_ratio'] = df['sepal_r'] / df['petal_r']
print(df[40:45], '\n\n')
print(df[70:75], '\n\n')
print(df[115:120])
    
    
In [33]:
    
df[:50].describe()
    
    Out[33]:
In [34]:
    
df[50:100].describe()
    
    Out[34]:
In [35]:
    
df[100:].describe()
    
    Out[35]:
In [36]:
    
#features = ['petal_r', 'sepal_r', 'sepal_size', 'petal_size', 'size_ratio']  # earlier trial, overridden below
features = ['sepal_w', 'petal_l', 'petal_w', 'petal_w2', 'petal_r', 'sepal_r', 'size_ratio']
dfx = df[features]
dfx.head()
    
    Out[36]:
In [37]:
    
# Normalize dfx; the data ranges differ widely between columns
# the same operation can be performed with sklearn preprocessing (StandardScaler)
#dfn = (dfx - dfx.min() + 0.0000001) / (dfx.max() - dfx.min())
#dfn = (df - df.min() + 0.0000001) / (df.max() - df.min())
dfn = (dfx - dfx.mean()) / dfx.std()
dfn.describe()  # Mean:0 Std:1
    
    Out[37]:
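The same standardization via scikit-learn, as a sketch (note: StandardScaler divides by the population std, ddof=0, while pandas .std() uses ddof=1, so the results differ by a factor of sqrt(n/(n-1))):

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
dfn_sk = pd.DataFrame(scaler.fit_transform(dfx), columns=dfx.columns)
dfn_sk.describe()  # mean 0; std slightly above 1 when measured with pandas' ddof=1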
In [38]:
    
def test_n_time(clf, n, test_ratio=0.3, normalize=True):
    cum_acc = []
    dataframe = dfn if normalize else df
        
    for i in range(n):
        X_train, X_test, y_train, y_test = train_test_split(dataframe, iris.target, test_size=test_ratio)
        #clf = LogisticRegression() # We can override parameter
        #clf = SVC()
        #clf = KNeighborsClassifier()
        clf.fit(X_train, y_train)  # fit model: adjust parameter
        # measure the accuracy on testing data
        p_test = clf.predict(X_test)
        #print('Predict ', p_test)
        #print('Expected', y_test)
        accuracy = (y_test == p_test).mean()
        #print(accuracy)
        cum_acc.append(accuracy)
        #print(list(X_test.index[y_test != p_test]))
    return cum_acc
    
In [39]:
    
#clf = LogisticRegression()
#clf = KNeighborsClassifier()
clf = SVC()
test_count = 150
result = test_n_time(clf, test_count)
print('Average accuracy over', test_count, 'runs:', sum(result) / test_count)
#print(result)
print('min:', min(result), 'max:', max(result), 'std:', np.std(result))
    
    
    
    
In [40]:
    
## Show the predicted probability of each record
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
clf = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=1)
clf.fit(X_train, y_train)
prob_lr = clf.predict_proba(X_test)
p_test_lr = clf.predict(X_test)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
prob_rf = clf.predict_proba(X_test)
p_test_rf = clf.predict(X_test)
for i in range(30):
    print(prob_lr[i], prob_rf[i], p_test_lr[i], p_test_rf[i], y_test[i])
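# To score the probability estimates themselves (not just the hard predictions),
# a short sketch with sklearn's log_loss metric (lower is better):
from sklearn.metrics import log_loss
print('LR log loss:', log_loss(y_test, prob_lr))
print('RF log loss:', log_loss(y_test, prob_rf))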
    
    
    
In [ ]: