kaggle-london
ref. https://www.kaggle.com/c/data-science-london-scikit-learn/discussion/34115
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['font.family']='SimHei'  # display Chinese characters (for Mac)
plt.rcParams['axes.unicode_minus']=False  # render minus signs correctly
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
#### READING OUR GIVEN DATA INTO PANDAS DATAFRAME ####
x_train = pd.read_csv('./input/train.csv',header=None)
y_train = pd.read_csv('./input/trainLabels.csv',header=None)
x_test = pd.read_csv('./input/test.csv',header=None)
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
x_test = np.asarray(x_test)
y_train = y_train.ravel()
print('training_x Shape:', x_train.shape,
      ', training_y Shape:', y_train.shape,
      ', testing_x Shape:', x_test.shape)
print('training_x len:', len(x_train),
      ', training_y len:', len(y_train),
      ', testing_x len:', len(x_test))
In [4]:
print(x_train[0:2])
In [5]:
print(x_test[0:2])
In [6]:
# Stack train and test features so the GMM below can be fit on all available (unlabeled) rows
x_all = np.r_[x_train,x_test]
print('x_all shape :',x_all.shape)
In [7]:
#### USING THE GAUSSIAN MIXTURE MODEL ####
# gmm.aic(X): Akaike information criterion for the current model on the input X.
# gmm.bic(X): Bayesian information criterion for the current model on the input X.
# Both return a float; the lower the value, the better the fit-complexity trade-off.
from sklearn.mixture import GaussianMixture
lowest_bic = np.infty
bic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type)
        gmm.fit(x_all)
        bic.append(gmm.bic(x_all))  # use aic() or bic()
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
best_gmm.fit(x_all)
x_train = best_gmm.predict_proba(x_train)
x_test = best_gmm.predict_proba(x_test)
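The loop above only keeps the running minimum, so it is hard to see how close the other configurations were. A minimal inspection sketch, assuming it is run immediately after the cell above and reuses its `bic`, `cv_types`, `n_components_range`, and `best_gmm` variables:
In [ ]:
# Sketch: print the full BIC table collected above and the configuration that won.
# Assumes `bic`, `cv_types`, `n_components_range`, and `best_gmm` from the previous cell.
bic_arr = np.array(bic).reshape(len(cv_types), len(n_components_range))
for cv_type, row in zip(cv_types, bic_arr):
    for n_components, value in zip(n_components_range, row):
        print('%-9s n_components=%d  BIC=%.1f' % (cv_type, n_components, value))
print('selected:', best_gmm.covariance_type, 'with', best_gmm.n_components, 'components')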
In [8]:
print(x_train[0:2])
In [9]:
print(x_test[0:2])
In [10]:
x_train[0:2,0]
Out[10]:
In [11]:
import seaborn as sns
from scipy import stats
from scipy.stats import norm
#histogram and normal probability plot
plt.figure(figsize=(12, 14))
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
plt.rcParams['font.size'] = 8
# Distribution plus normal probability plot for the first four GMM posterior columns
for i in range(4):
    plt.subplot(4, 2, 2 * i + 1)
    plt.title('x_train[:,%d]' % i)
    sns.distplot(x_train[:, i], fit=norm)
    plt.subplot(4, 2, 2 * i + 2)
    res = stats.probplot(x_train[:, i], plot=plt)
In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# GridSearchCV and cross_val_score live in sklearn.model_selection
# (replacing the removed sklearn.grid_search / sklearn.cross_validation modules).
from sklearn.model_selection import GridSearchCV, cross_val_score
#from sklearn import svm
In [13]:
#### TAKING ONLY TWO MODELS FOR KEEPING IT SIMPLE ####
knn = KNeighborsClassifier()
rf = RandomForestClassifier()
In [14]:
param_grid = dict()  # empty grid: GridSearchCV just cross-validates the default KNN
#### GRID SEARCH for BEST TUNING PARAMETERS FOR KNN #####
grid_search_knn = GridSearchCV(knn, param_grid=param_grid, cv=10, scoring='accuracy').fit(x_train, y_train)
print('best estimator KNN:', grid_search_knn.best_estimator_,
      'Training accuracy:', grid_search_knn.best_estimator_.score(x_train, y_train))
knn_best = grid_search_knn.best_estimator_
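Because `param_grid` is empty, the search above only cross-validates the default KNN. A hedged sketch of what a real tuning grid could look like (the parameter names are standard scikit-learn ones, but the value ranges are illustrative assumptions, not the original kernel's):
In [ ]:
# Sketch: a non-empty KNN grid (value ranges are illustrative assumptions).
knn_param_grid = {'n_neighbors': [3, 5, 7, 9, 11],
                  'weights': ['uniform', 'distance']}
grid_search_knn_tuned = GridSearchCV(KNeighborsClassifier(), param_grid=knn_param_grid,
                                     cv=10, scoring='accuracy').fit(x_train, y_train)
print('tuned KNN params:', grid_search_knn_tuned.best_params_,
      'CV accuracy:', grid_search_knn_tuned.best_score_)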
In [15]:
#### GRID SEARCH for BEST TUNING PARAMETERS FOR RandomForest #####
# Again an empty grid, so only the default RandomForest is cross-validated.
grid_search_rf = GridSearchCV(rf, param_grid=dict(), verbose=3, scoring='accuracy', cv=10).fit(x_train, y_train)
print('best estimator RandomForest:', grid_search_rf.best_estimator_,
      'Training accuracy:', grid_search_rf.best_estimator_.score(x_train, y_train))
rf_best = grid_search_rf.best_estimator_

knn_best.fit(x_train, y_train)
print("KNN:", knn_best.predict(x_test)[0:10])
rf_best.fit(x_train, y_train)
print("RF:", rf_best.predict(x_test)[0:10])
In [16]:
#### SCORING THE MODELS ####
print('Score for KNN :', cross_val_score(knn_best, x_train, y_train, cv=10, scoring='accuracy').mean())
print('Score for Random Forest :', cross_val_score(rf_best, x_train, y_train, cv=10, scoring='accuracy').mean())
In [17]:
### IF WE WERE USING MORE THAN ONE CLASSIFIER, A VOTING CLASSIFIER COULD BE USEFUL ###
#clf = VotingClassifier(
#        estimators=[('knn_best', knn_best), ('rf_best', rf_best)],
#        #weights=[0.871856020222, 0.907895269918]
#        )
#clf.fit(x_train, y_train)
#print("Voting:", clf.predict(x_test)[0:10])
In [18]:
##### FRAMING OUR SOLUTION #####
knn_best_pred = knn_best.predict(x_test)
rf_best_pred = rf_best.predict(x_test)
#voting_clf_pred = pd.DataFrame(clf.predict(x_test))
In [19]:
row_id = list(range(1, 9001))  # the test set has 9000 rows, so Ids run from 1 to 9000
In [20]:
# Generate Submission File
StackingSubmission = pd.DataFrame({ 'Id': row_id, 'Solution': rf_best_pred })[["Id","Solution"]]
StackingSubmission.to_csv("submission_PCA_GMM.csv", index=False)
In [21]:
##--aic(Akaike information criterion) LB score: 0.99105
##--bic(Bayesian information criterion) LB score: 0.99143