In the past month, we experimented with classification on a variety of simulated datasets. For each dataset, we also explored two types of features, and we applied several classification algorithms, including support vector machines (SVC), random forests (RFC), and gradient boosting classifiers (GBC).
Today, we want to combine the predictions of several base estimators built with a given learning algorithm in order to improve generalizability and robustness over any single estimator.
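To make the idea concrete, here is a minimal sketch of this kind of combination using scikit-learn's VotingClassifier. The toy data and hyperparameters below are placeholders for illustration, not the tuned models used later in this notebook.
In [ ]:
from sklearn.datasets import make_classification
from sklearn.ensemble import (GradientBoostingClassifier,
                              RandomForestClassifier, VotingClassifier)
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Toy stand-in for the transit feature matrices used below.
X_demo, Y_demo = make_classification(n_samples=500, n_features=10,
                                     random_state=0)

# Hard voting: each base estimator casts one vote per sample and the
# majority label wins, which can smooth over individual models' mistakes.
ensemble = VotingClassifier(estimators=[
    ('svc', SVC(gamma=0.005, C=100)),
    ('rfc', RandomForestClassifier(n_estimators=100)),
    ('gbc', GradientBoostingClassifier(n_estimators=100)),
], voting='hard')
print(cross_val_score(ensemble, X_demo, Y_demo, cv=5, scoring='f1').mean())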
We start by introducing the performance of each individual algorithm on the data set that is closest to real data: the simulated data with both white and red noise, three types of planets with realistic planet ratios, and planets allowed to have various periods.
To speed up the process, we performed the data manipulation and feature selection ahead of time and saved the optimized features into individual files for the different algorithms. For details, go through the other notebooks in the git repository.
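As a rough sketch of what those precomputed files look like, assuming only what the loading code below requires (a feature matrix plus a Y label column and, for some files, an IDs column; the file name and feature names here are hypothetical):
In [ ]:
import numpy as np
import pandas as pd

# Hypothetical example of writing one per-algorithm feature file; the real
# feature extraction lives in the other notebooks in this repository.
features = pd.DataFrame(np.random.rand(5, 3), columns=['f1', 'f2', 'f3'])
features['Y'] = [0, 1, 0, 1, 1]            # 1 = transit, 0 = false positive
features['IDs'] = np.arange(1000, 1005)    # per-light-curve simulation ID
features.to_csv("data/EXAMPLE_features.csv")  # read back with index_col=0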
In [1]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
Let's introduce the diagnostics we use for each algorithm:
In [2]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    # Display the 2x2 confusion matrix as an image, with class 0 shown as
    # "false positives" and class 1 as "transits".
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ['false positives', 'transits'], rotation=45)
    plt.yticks(tick_marks, ['false positives', 'transits'])
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [3]:
def fit(model, name, data):
    # Train the model, report the F1 score on a held-out split and from
    # 5-fold cross-validation, and plot the confusion matrix and ROC curve.
    trainX, trainY, testX, testY, X, Y = data
    model.fit(trainX, trainY)
    predY = model.predict(testX)
    score = metrics.f1_score(testY, predY)  # argument order is (y_true, y_pred)
    cvscore = cross_val_score(model, X, Y, cv=5, scoring='f1')
    print("#####################################")
    print("Result using", model)
    print("f1 score from train test split %f" % score)
    print("f1 score from CV5 %f" % np.mean(cvscore))
    cm = metrics.confusion_matrix(testY, predY)
    plot_confusion_matrix(cm)
    print(cm)
    make_ROC_curve(testY, predY, name)
    return
In [4]:
def check_FPs(model, testX, testY, testIDs):
    # Histogram the simulation IDs of correctly and incorrectly classified
    # test samples to see where the misclassifications cluster.
    predY = model.predict(testX)
    index = predY == testY  # True where the prediction is correct
    bins = np.linspace(1000, 7000, 13)
    print(bins)
    plt.hist(testIDs[index], bins=bins)
    plt.hist(testIDs[~index], bins=bins)
    return
In [5]:
def make_ROC_curve(testY, predY, name):
    # Note: predY holds hard class labels, so this ROC curve has a single
    # interior operating point rather than a full threshold sweep.
    fig2 = plt.figure()
    ax = fig2.add_subplot(1, 1, 1)
    fpr, tpr, _ = roc_curve(testY, predY)
    ax.plot(fpr, tpr, label=name)
    ax.set_title('ROC Curve for %s' % name)
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
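Since predict() returns hard class labels, the curve above collapses to a single operating point. A fuller curve needs continuous scores; here is a hedged variant (make_score_ROC_curve is our name, not code from the repository) that uses the model's decision_function when available and falls back to predict_proba, reusing the roc_curve and pyplot imports from above:
In [ ]:
def make_score_ROC_curve(model, testX, testY, name):
    # Prefer a continuous decision score; fall back to class-1 probability.
    if hasattr(model, "decision_function"):
        scores = model.decision_function(testX)
    else:
        scores = model.predict_proba(testX)[:, 1]
    fpr, tpr, _ = roc_curve(testY, scores)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(fpr, tpr, label=name)
    ax.set_title('ROC Curve for %s' % name)
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')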
In [6]:
lc_data = pd.read_csv("data/BL_CR_ALL_features.csv", index_col=0)
lc_data.head()
In [7]:
X = lc_data.drop('Y', axis=1)
Y = lc_data['Y']
# IDs = lc_data['ID']
X = StandardScaler().fit_transform(X)
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2)
In [8]:
model = SVC(gamma=0.005, C=100)
name = "SVC"
data = [trainX, trainY, testX, testY, X, Y]
fit(model, name, data)
In [9]:
model = RandomForestClassifier(n_estimators=1000)
name = "RFC"
fit(model, name, data)
In [10]:
model = GradientBoostingClassifier(n_estimators=1000)
name = "GBC"
fit(model, name, data)
In [11]:
SVM_data = pd.read_csv("data/BL_CR_BLS_SVM_features.csv", index_col=0)
SVM_data.head()
In [12]:
X = SVM_data.drop(['Y', 'IDs'], axis=1)
Y = SVM_data['Y']
IDs = SVM_data['IDs']
X = StandardScaler().fit_transform(X)
trainX, testX, trainY, testY, trainIDs, testIDs = train_test_split(X, Y, IDs, test_size=0.2)
In [13]:
model = SVC(gamma=0.05, C=50)
name = "SVC"
data = [trainX, trainY, testX, testY, X, Y]
fit(model, name, data)
In [14]:
check_FPs(model, testX, testY, testIDs)
In [15]:
RFC_data = pd.read_csv("data/BL_CR_BLS_RFC_features.csv", index_col=0)
X = RFC_data.drop(['Y', 'IDs'], axis=1)
X = StandardScaler().fit_transform(X)
Y = RFC_data['Y']
IDs = RFC_data['IDs']
trainX, testX, trainY, testY, trainIDs, testIDs = train_test_split(X, Y, IDs, test_size=0.2)
# RFC_data.head()
In [16]:
model = RandomForestClassifier(n_estimators=1000)
name = "RFC"
data = [trainX, trainY, testX, testY, X, Y]
fit(model, name, data)
In [17]:
check_FPs(model, testX, testY, testIDs)
In [18]:
GBC_data = pd.read_csv("data/BL_CR_BLS_GBC_features.csv", index_col=0)
X = GBC_data.drop(['Y', 'IDs'], axis=1)
X = StandardScaler().fit_transform(X)
Y = GBC_data['Y']
IDs = GBC_data['IDs']
trainX, testX, trainY, testY, trainIDs, testIDs = train_test_split(X, Y, IDs, test_size=0.2)
# GBC_data.head()
In [19]:
model = GradientBoostingClassifier(n_estimators=1000)
name = "GBC"
data = [trainX, trainY, testX, testY, X, Y]
fit(model, name, data)
In [20]:
check_FPs(model, testX, testY, testIDs)
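With the three per-algorithm runs in hand, the remaining step is the combination described at the top. Because each model is trained on its own feature file, scikit-learn's VotingClassifier cannot be applied directly; one hedged sketch is to align each model's predictions on the shared IDs column and take a majority vote. The helper below is our illustration, not code from this repository.
In [ ]:
def majority_vote(pred_frames):
    # pred_frames: one DataFrame per base model with columns ['IDs', 'pred']
    # (hypothetical layout). Predictions are aligned on IDs before voting.
    merged = pred_frames[0][['IDs']].copy()
    for i, frame in enumerate(pred_frames):
        merged = merged.merge(frame.rename(columns={'pred': 'pred_%d' % i}),
                              on='IDs')
    votes = merged.drop('IDs', axis=1)
    # Call a sample a transit (1) when more than half of the models agree.
    merged['ensemble'] = (votes.mean(axis=1) > 0.5).astype(int)
    return merged[['IDs', 'ensemble']]
For the three classifiers above, pred_frames would hold each model's predict() output on a common test set keyed by testIDs.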