In [1]:
import pandas as pd
import numpy as np
import time
import math
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import AdaBoostClassifier as AdaBoost
from sklearn.ensemble import BaggingClassifier as Bagging
from sklearn.ensemble import GradientBoostingClassifier as GBoost
from sklearn.ensemble import VotingClassifier as Voting
from sklearn import preprocessing
import xgboost as xgb
In [2]:
# load data
train = pd.read_csv("train.csv")
print(train.shape)
In [3]:
train.head()
Out[3]:
In [4]:
# check NaN column by column
cols = ['gender', 'cd', 'hd', 'age', 'dbdistance', 'vccdistance',
        'party', 'racename', 'hsonly', 'mrrg', 'chldprsnt', 'cath',
        'evang', 'nonchrst', 'otherchrst', 'days.since.reg']
for col in cols:
    print(col + ' any null? ' + str(train[col].isnull().values.any()))
#print('Id any null? ' + str(train['Id'].isnull().values.any()))
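For reference, the same check can be done in a single call; pandas counts the missing values per column:
In [ ]:
# count missing values per column in one shot
print(train[cols].isnull().sum())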
In [5]:
# fill missing data
predictors_na = ['cd', 'hd', 'dbdistance', 'vccdistance']
train_filled = train.copy()
train_filled[predictors_na] = train_filled[predictors_na].apply(lambda col: col.fillna(col.mean()))
In [6]:
for col in predictors_na:
    print(col + ' any null? ' + str(train_filled[col].isnull().values.any()))
In [7]:
# one-hot encoding categorical predictors
predictors = ['gender', 'cd', 'hd', 'age', 'dbdistance', 'vccdistance',
              'party', 'racename', 'hsonly', 'mrrg', 'chldprsnt', 'cath', 'evang',
              'nonchrst', 'otherchrst', 'days.since.reg']
cate = ['gender', 'cd', 'hd', 'party', 'racename']
df_x = pd.DataFrame(train_filled[predictors])
df_x.head()
Out[7]:
In [8]:
# one-hot encode each categorical predictor, then drop the original column
prefixes = {'gender': 'gen', 'cd': 'cd', 'hd': 'hd', 'party': 'pty', 'racename': None}
for col in cate:
    onehot = pd.get_dummies(df_x[col], prefix=prefixes[col])
    df_x = df_x.drop(col, axis=1)
    df_x = df_x.join(onehot)
df_x.head(7)
print(df_x.shape)
In [9]:
# separate x, y
x = df_x.values
y = train_filled['voted'].values
print(x[0:3, :])
print(y[0:3])
In [10]:
# Randomly split into train and test sets
my_x_train, my_x_test, my_y_train, my_y_test = train_test_split(x, y, test_size=0.25, random_state=42)
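A hedged refinement, not used in the original run: if the voted classes are imbalanced, passing stratify=y keeps the class ratio identical across the two splits:
In [ ]:
# stratified variant of the same split (preserves class proportions)
my_x_train, my_x_test, my_y_train, my_y_test = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y)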
In [11]:
print(my_x_train.shape)
print(my_x_test.shape)
In [12]:
# load data
test = pd.read_csv("test.csv")
print(test.shape)
test.head()
Out[12]:
In [13]:
# check NaN column by column (reuses cols from the train check above)
for col in cols + ['Id']:
    print(col + ' any null? ' + str(test[col].isnull().values.any()))
In [14]:
# fill missing data
predictors_na = ['cd', 'hd', 'dbdistance', 'vccdistance']
test_filled = test.copy()
test_filled[predictors_na] = test_filled[predictors_na].apply(lambda col: col.fillna(col.mean()))
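A common refinement, not done here: impute the test columns with the training-set means instead of the test-set means, so both sets share the same fill values:
In [ ]:
# fill test NaNs with the training-set column means (sketch)
test_filled[predictors_na] = test[predictors_na].fillna(train[predictors_na].mean())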
In [15]:
# one-hot encoding categorical predictors
predictors = ['gender', 'cd', 'hd', 'age', 'dbdistance', 'vccdistance',
              'party', 'racename', 'hsonly', 'mrrg', 'chldprsnt', 'cath', 'evang',
              'nonchrst', 'otherchrst', 'days.since.reg']
cate = ['gender', 'cd', 'hd', 'party', 'racename']
df_test_x = pd.DataFrame(test_filled[predictors])
df_test_x.head()
Out[15]:
In [16]:
# one-hot encode each categorical predictor, then drop the original column
# (reuses the prefixes dict defined for the training set)
for col in cate:
    onehot = pd.get_dummies(df_test_x[col], prefix=prefixes[col])
    df_test_x = df_test_x.drop(col, axis=1)
    df_test_x = df_test_x.join(onehot)
df_test_x.head(7)
print(df_test_x.shape)
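Note: pd.get_dummies can yield a different column set for test than for train whenever a category level appears in only one of the two files. A minimal alignment sketch (assuming df_x still holds the training design matrix):
In [ ]:
# align test columns to the training design matrix: training-only dummy
# columns are added as zeros, test-only dummy columns are dropped
df_test_x = df_test_x.reindex(columns=df_x.columns, fill_value=0)
print(df_test_x.shape)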
In [17]:
# separate x
test_x = df_test_x.values
print(test_x[0:3, :])
In [25]:
# Testing parameters
num_sample = 50000
# Parameters used for both GBoost and XGBoost
GBoost_stages = 100
GBoost_depth = 5
# Randomly split into train and test sets
my_x_train, my_x_test, my_y_train, my_y_test = train_test_split(x, y, test_size=0.05, random_state=42)
# Slice the first num_sample
my_x_train = my_x_train[:num_sample]
my_y_train = my_y_train[:num_sample]
# Print data size
print(my_x_train.shape)
print(my_y_train.shape)
In [26]:
start_time = time.time()
mGBoost = GBoost(n_estimators=GBoost_stages, max_depth=GBoost_depth)
mGBoost.fit(my_x_train, my_y_train)
Gcompute_time = time.time() - start_time
print "GradBoosting score: " + str(mGBoost.score(my_x_test, my_y_test))
print "Training Time: " + str(Gcompute_time) + "sec"
In [27]:
mGBoost.score(my_x_train, my_y_train)
Out[27]:
In [28]:
mGBoost.classes_
Out[28]:
In [29]:
# Check parameters
print(mGBoost)
In [30]:
start_time = time.time()
mXGBoost = xgb.XGBClassifier(n_estimators=GBoost_stages, max_depth=GBoost_depth)
mXGBoost.fit(my_x_train, my_y_train)
XGcompute_time = time.time() - start_time
print "XGBoost score: " + str(mXGBoost.score(my_x_test, my_y_test))
print "Compute Time: " + str(XGcompute_time) + "sec"
In [31]:
mXGBoost.score(my_x_train, my_y_train)
Out[31]:
In [32]:
mXGBoost.classes_
Out[32]:
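Both boosted models expose per-feature importances once fitted; a quick sketch for eyeballing which predictors drive the fit (column names taken from df_x, so this assumes the training design matrix is unchanged):
In [ ]:
# rank predictors by importance from the fitted gradient-boosting model
imp = pd.Series(mGBoost.feature_importances_, index=df_x.columns)
print(imp.sort_values(ascending=False).head(10))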
In [117]:
%matplotlib inline
import matplotlib.pyplot as plt
from xgboost import plot_tree
plot_tree(mXGBoost, num_trees=2)
# plt.show()
plt.savefig("xgboost.png")
In [64]:
# Check parameters
print(mXGBoost)
In [169]:
# compute training-set accuracy after each boosting stage
GBoost_train_acc = np.zeros((GBoost_stages,), dtype=np.float64)
for idx, pred_y_now in enumerate(mGBoost.staged_predict(my_x_train)):
    GBoost_train_acc[idx] = np.mean(pred_y_now == my_y_train)
print(GBoost_train_acc)
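The same staged loop on the held-out split shows where extra stages stop helping; a quick overfitting check, sketched under the same variable names:
In [ ]:
# held-out accuracy after each boosting stage
GBoost_test_acc = np.zeros((GBoost_stages,), dtype=np.float64)
for idx, pred_y_now in enumerate(mGBoost.staged_predict(my_x_test)):
    GBoost_test_acc[idx] = np.mean(pred_y_now == my_y_test)
print(GBoost_test_acc.max(), GBoost_test_acc.argmax())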
In [182]:
# GBOOST: ACCURACY VS. TRAINING TIME
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.rcParams.update({'font.size': 14})
# plot training accuracy against cumulative training time
fig, ax = plt.subplots(1, 1, figsize=(7, 4))
ax.set_xlabel('Time (ms)')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs Time')
#ax.set_xticks(range(0, GBoost_stages+5, 5))
#ax.set_ylim([0.65, 0.75])
Gtick = np.array(range(GBoost_stages))*(Gcompute_time/GBoost_stages*1000)
#XGtick = np.array(range(GBoost_stages))*(XGcompute_time/GBoost_stages*1000)
ax.plot(Gtick, GBoost_train_acc, 'b-*', label='GBoost')
#ax.plot(XGtick,mXGBoost.train_score_, 'g-', label='XGBoost')
ax.legend(loc=2)
ax.grid()
plt.tight_layout()
plt.show()
In [21]:
# --------------
# A generic function to do CV
# --------------
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
    gs.fit(X, y)
    best = gs.best_estimator_
    return best
In [14]:
# --------------
# Optimize n_estimators, max_depth in random forest
# --------------
dt_rforest = ensemble.RandomForestClassifier(max_depth=3)
rforest = dt_rforest.fit(my_x_train, my_y_train)
parameters = {'max_depth': [12, 14, 16, 18, 20, 22, 24], 'n_estimators': [20, 25, 30, 35, 40]}
rforest = cv_optimize(rforest, parameters, my_x_train, my_y_train, n_jobs=10, n_folds=5, score_func=None)
print('Optimal n_estimators: ' + str(rforest.n_estimators))
print('Optimal max_depth: ' + str(rforest.max_depth))
In [15]:
# --------------
# Random Forest
# --------------
start_time = time.time()
dt_rforest = ensemble.RandomForestClassifier(max_depth=18, n_estimators=40)
dt_rforest.fit(my_x_train, my_y_train)
compute_time = time.time() - start_time
print("Random forest score: " + str(dt_rforest.score(my_x_test, my_y_test)))
print("Compute Time: " + str(compute_time) + " sec")
In [16]:
dt_rforest.score(my_x_train, my_y_train)
Out[16]:
In [68]:
dt_rforest.classes_
Out[68]:
In [82]:
pred_y = dt_rforest.predict_proba(test_x)
print(pred_y[0:5, :])
In [89]:
# dt_KNN (k-nearest neighbours) was fit in a cell not shown in this section
pred_y = dt_KNN.predict_proba(test_x)
print(pred_y[0:5, :])
In [94]:
pred_y = mGBoost.predict_proba(test_x)
print(pred_y[0:5, :])
In [100]:
# mBag (BaggingClassifier) was fit in a cell not shown in this section
pred_y = mBag.predict_proba(test_x)
print(pred_y[0:5, :])
In [101]:
df_pred = pd.DataFrame(test_filled['Id'])
df_pred['voted'] = pd.Series(pred_y[:,1], index=df_pred.index)
df_pred.head()
Out[101]:
In [102]:
#df_pred.to_csv('pred_RF_d18_n35.csv',index=False)
#df_pred.to_csv('pred_KNN_k21.csv',index=False)
#df_pred.to_csv('pred_GBoost.csv',index=False)
df_pred.to_csv('pred_Bagging.csv',index=False)
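Since predict_proba was already run for each fitted model above, a simple soft-vote blend is a natural next submission to try; a sketch (the output filename is illustrative):
In [ ]:
# average the class-1 probabilities of the four fitted models
probs = [m.predict_proba(test_x)[:, 1] for m in (dt_rforest, dt_KNN, mGBoost, mBag)]
blend = np.mean(probs, axis=0)
df_blend = pd.DataFrame({'Id': test_filled['Id'], 'voted': blend})
df_blend.to_csv('pred_blend.csv', index=False)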
In [ ]: