In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'SimHei'  # display Chinese characters in plots
%matplotlib inline
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
train = pd.read_csv('input/train.csv', encoding = "utf-8", dtype = {'type': np.int32})
test = pd.read_csv('input/test.csv', encoding = "utf-8")
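# Quick sanity check on the loaded data (optional, not in the original notebook):
print(train.shape, test.shape)
print(train['type'].value_counts())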
In [4]:
# Drop the demo rows with type 4 so they do not interfere with modeling
train = train[train['type'] != 4]
In [5]:
from sklearn.model_selection import train_test_split
X = train[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']]
y = train['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=100)
In [6]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
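# Optional sanity check (a sketch, not in the original notebook): after
# StandardScaler, each training column should have mean ~0 and std ~1,
# since z = (x - mean) / std.
print(X_train_std.mean(axis=0).round(3))  # expect values near 0
print(X_train_std.std(axis=0).round(3))   # expect values near 1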
In [7]:
from sklearn.svm import SVC
from sklearn import metrics
svc = SVC(C=1.0, kernel="rbf", probability=True)
svc.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, svc.predict(X_test_std)))
print(metrics.confusion_matrix(y_test, svc.predict(X_test_std)))
In [8]:
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf', 'poly'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
scores = ['precision', 'recall', 'f1']
#F1 = 2 * (precision * recall) / (precision + recall)
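# Worked example of the F1 formula above, with illustrative numbers:
# precision = 0.9, recall = 0.6  ->  F1 = 2 * 0.54 / 1.5 = 0.72
precision, recall = 0.9, 0.6
print(2 * (precision * recall) / (precision + recall))  # 0.72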
In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
for score in scores:
    print("# Tuning hyper-parameters for < %s > -------------------------" % score)
    print()
    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print("mean_test_score (+/- std_test_score) for { parameters.. }")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
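# The fitted GridSearchCV object also exposes the refit winner directly; a
# minimal follow-up sketch using standard attributes (clf here holds the last
# search from the loop, i.e. the f1-tuned one):
print(clf.best_score_)                            # best mean cross-validated score
print(clf.best_estimator_.score(X_test, y_test))  # refit model on held-out data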
In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Ana.",
         "Quadratic Discriminant Ana.", "XGBoost"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, probability=True),
    SVC(kernel='rbf', C=100, gamma=0.001, probability=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    XGBClassifier(n_estimators=2000, max_depth=4)]
In [11]:
X = train[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']]
y = train['type']
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=100)
test_std = sc.transform(test[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']])
# iterate over classifiers
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%r score: %0.04f" % (name, score))
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print(clf.predict(test_std))
    print("--------------------------------------------------------------")
In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb
clf1 = KNeighborsClassifier(n_neighbors=3, weights='uniform')
clf2 = RandomForestClassifier(n_estimators=500, criterion='gini', max_features='auto', oob_score=True)
clf3 = GaussianNB()
clf4 = SVC(kernel='rbf', C=100, gamma=0.001, probability=True)
clf5 = AdaBoostClassifier()
clf6 = LinearDiscriminantAnalysis()
clf7 = QuadraticDiscriminantAnalysis()
clf8 = xgb.XGBClassifier(n_estimators=2000, max_depth=4)
meta_clf = LinearDiscriminantAnalysis()
stacking_clf1 = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8],
                                   meta_classifier=meta_clf)
clf1.fit(X_train_std, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train_std, y_train)
clf4.fit(X_train_std, y_train)
clf5.fit(X_train_std, y_train)
clf6.fit(X_train_std, y_train)
clf7.fit(X_train_std, y_train)
clf8.fit(X_train_std, y_train)
stacking_clf1.fit(X_train_std, y_train)
print('KNN Score:',clf1.score(X_test_std, y_test))
print('RF Score:',clf2.score(X_test, y_test))
print('GNB Score:',clf3.score(X_test_std, y_test))
print('SVC Score:',clf4.score(X_test_std, y_test))
print('AdaBoost Score:',clf5.score(X_test_std, y_test))
print('L-Discr. Score:',clf6.score(X_test_std, y_test))
print('Q-Discr. Score:',clf7.score(X_test_std, y_test))
print('XGBoost Score:',clf8.score(X_test_std, y_test))
print('Stacking Score:',stacking_clf1.score(X_test_std, y_test))
print("--------------------------------------------------------------")
print(stacking_clf1.predict(test_std))
In [13]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb
clf1 = LinearDiscriminantAnalysis()
clf2 = QuadraticDiscriminantAnalysis()
clf3 = xgb.XGBClassifier(n_estimators=2000, max_depth=4)
meta_clf = LinearDiscriminantAnalysis()
stacking_clf2 = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=meta_clf)
clf1.fit(X_train_std, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train_std, y_train)
stacking_clf2.fit(X_train_std, y_train)
print('L-Discr. Score:',clf1.score(X_test_std, y_test))
print('Q-Discr. Score:',clf2.score(X_test_std, y_test))
print('XGBoost Score:',clf3.score(X_test_std, y_test))
print('Stacking Score:',stacking_clf2.score(X_test_std, y_test))
print("--------------------------------------------------------------")
print(stacking_clf2.predict(test_std))
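# mlxtend's StackingClassifier can also pass class probabilities (rather than
# hard labels) to the meta-learner; a variant sketch under that assumption,
# reusing the three base models above:
stacking_proba = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                    meta_classifier=LinearDiscriminantAnalysis(),
                                    use_probas=True, average_probas=False)
stacking_proba.fit(X_train_std, y_train)
print('Stacking (probas) Score:', stacking_proba.score(X_test_std, y_test))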
In [14]:
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb
clf1 = KNeighborsClassifier(n_neighbors=3, weights='uniform')
clf2 = RandomForestClassifier(n_estimators=500, criterion='gini', max_features='auto', oob_score=True)
clf3 = GaussianNB()
clf4 = SVC(kernel='rbf', C=100, gamma=0.001, probability=True)
clf5 = AdaBoostClassifier()
clf6 = LinearDiscriminantAnalysis()
clf7 = QuadraticDiscriminantAnalysis()
clf8 = xgb.XGBClassifier(n_estimators=2000, max_depth=4)
eclf1 = VotingClassifier(estimators=[('knn', clf1), ('rfc', clf2), ('gnb', clf3), ('svc', clf4),
                                     ('Ada', clf5), ('Lda', clf6), ('Qda', clf7), ('XGB', clf8)],
                         voting='hard', weights=[1, 1, 1, 1, 1, 1, 1, 1])
eclf1.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, eclf1.predict(X_test_std)))
print("--------------------------------------------------------------")
print(eclf1.predict(test_std))
In [15]:
from sklearn.ensemble import VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb
clf1 = LinearDiscriminantAnalysis()
clf2 = QuadraticDiscriminantAnalysis()
clf3 = xgb.XGBClassifier(n_estimators=2000, max_depth=4)
eclf2 = VotingClassifier(estimators=[('Lda', clf1), ('Qda', clf2), ('XGB', clf3)],
                         voting='hard', weights=[1, 1, 1])
eclf2.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, eclf2.predict(X_test_std)))
print("--------------------------------------------------------------")
print(eclf2.predict(test_std))
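# With voting='hard' each model casts one vote per sample; a soft-voting variant
# averages predicted class probabilities instead. All three estimators here
# implement predict_proba, so this sketch should work as a drop-in comparison:
eclf2_soft = VotingClassifier(estimators=[('Lda', clf1), ('Qda', clf2), ('XGB', clf3)],
                              voting='soft', weights=[1, 1, 1])
eclf2_soft.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, eclf2_soft.predict(X_test_std)))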
In [16]:
# Course handout: Python machine learning and deep learning in practice
# Learning-curve plotting helper adapted from the scikit-learn documentation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure(figsize=(10, 6))  # enlarge the figure
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
In [17]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, random_state=None, shuffle=True)
estimator = SVC(C=1.0, kernel="rbf", probability=True)
plot_learning_curve(estimator, "SVM", X_train_std, y_train, cv=cv, train_sizes=np.linspace(0.2, 1.0, 5))
Out[17]: [learning-curve figure: SVM]
In [18]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, random_state=None, shuffle=True)
estimator = LinearDiscriminantAnalysis()
plot_learning_curve(estimator, "Linear Discriminant Analysis", X_train_std, y_train, cv=cv, train_sizes=np.linspace(0.2, 1.0, 5))
Out[18]: [learning-curve figure: Linear Discriminant Analysis]
In [19]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, random_state=None, shuffle=True)
estimator = QuadraticDiscriminantAnalysis()
plot_learning_curve(estimator, "Quadratic Discriminant Analysis", X_train_std, y_train, cv=cv, train_sizes=np.linspace(0.2, 1.0, 5))
Out[19]: [learning-curve figure: Quadratic Discriminant Analysis]
In [20]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, random_state=None, shuffle=True)
estimator = xgb.XGBClassifier(n_estimators=2000, max_depth=4)
plot_learning_curve(estimator, "XGBoost", X_train_std, y_train, cv=cv, train_sizes=np.linspace(0.2, 1.0, 5))
Out[20]: [learning-curve figure: XGBoost]
In [21]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, random_state=None, shuffle=True)
estimator = eclf1
plot_learning_curve(estimator, "Voting group 1", X_train_std, y_train, cv=cv, train_sizes=np.linspace(0.2, 1.0, 5))
Out[21]: [learning-curve figure: Voting group 1]
In [22]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, random_state=None, shuffle=True)
estimator = eclf2
plot_learning_curve(estimator, "Voting group 2", X_train_std, y_train, cv=cv, train_sizes=np.linspace(0.2, 1.0, 5))
Out[22]: [learning-curve figure: Voting group 2]
In [23]:
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout
def create_model(optimizer='adam', init='normal'):
    model = Sequential()
    model.add(Dense(512, input_dim=4, activation='relu', kernel_initializer=init))
    #model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu', kernel_initializer=init))
    model.add(Dense(64, activation='relu', kernel_initializer=init))
    model.add(Dense(3, activation='softmax', kernel_initializer=init))
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model
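# Standalone training sketch without the scikit-learn wrapper (hypothetical
# follow-up, not in the original notebook). to_categorical needs 0-based class
# codes, so encode the raw type labels first:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(y_train)
mlp = create_model()
mlp.fit(X_train_std, np_utils.to_categorical(le.transform(y_train)),
        epochs=20, batch_size=5, verbose=0)
print(mlp.evaluate(X_test_std, np_utils.to_categorical(le.transform(y_test)), verbose=0))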
In [25]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
# create model
model = KerasClassifier(build_fn=create_model, verbose=0)
# grid search epochs, batch size and optimizer
optimizers = ['rmsprop', 'adam']
init = ['glorot_uniform', 'normal', 'uniform']
epochs = [10, 20, 50]
batches = [5, 10, 20]
param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches, init=init)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_train_std, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
In [32]:
def c_model(optimizer='adam', init='uniform'):
    model = Sequential()
    model.add(Dense(512, input_dim=4, activation='relu', kernel_initializer=init))
    #model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu', kernel_initializer=init))
    model.add(Dense(64, activation='relu', kernel_initializer=init))
    model.add(Dense(3, activation='softmax', kernel_initializer=init))
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model
In [33]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, random_state=None, shuffle=True)
estimator = KerasClassifier(build_fn=c_model, epochs=20, batch_size=5)
plot_learning_curve(estimator, "MLP with Keras", X_train_std, y_train, cv=cv, train_sizes=np.linspace(0.2, 1.0, 5))
Out[33]: [learning-curve figure: MLP with Keras]