In [2]:
import sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
In [3]:
cd kaggle
In [4]:
# Paths to the pre-processed competition data (relative to the kaggle dir).
train_file = 'train_processed.csv'
test_file = 'test_processed.csv'

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# Feature columns: every column except the prediction target and the
# QuoteNumber row identifier.
target_col = 'QuoteConversion_Flag'
non_features = {target_col, 'QuoteNumber'}
features_col = [col for col in train_df.columns if col not in non_features]
In [5]:
# Hold out 20% of the labelled data for evaluation.
# random_state pins the split so the notebook is reproducible on
# Restart & Run All (the original split changed on every run).
training, testing = train_test_split(train_df, train_size=0.8, random_state=42)
training.info()
testing.info()
In [33]:
%%timeit -n 1 -r 1
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBC
rl_clf = GBC(n_estimators = 500) #RandomForestClassifier(n_estimators=50)
rl_clf.fit(training[features_col], training[target_col])
In [34]:
# Python 3-compatible print() calls (originals were Python 2 print
# statements, which are a SyntaxError on any modern kernel).
print(rl_clf.score(training[features_col], training[target_col]))  # train accuracy
print(rl_clf.score(testing[features_col], testing[target_col]))    # held-out accuracy
In [11]:
t.shape
Out[11]:
In [12]:
# Attach the positive-class probability as the submission target, then
# write only the two columns Kaggle expects, without the index.
test_df['QuoteConversion_Flag'] = t[:, 1]
submission_cols = ['QuoteNumber', 'QuoteConversion_Flag']
test_df[submission_cols].to_csv('submission.csv', index=False)
In [13]:
import xgboost as xgb
In [31]:
%%timeit -n 1 -r 1
# xgb_clf = xgb.XGBClassifier(n_estimators = 100, nthread=-1, max_depth=4, learning_rate=0.05,subsample=0.5,colsample_bytree=0.8)
xgb_clf = xgb.XGBClassifier()
xgb_model = xgb_clf.fit(training[features_col], training[target_col], eval_metric = "auc")
In [32]:
xgb_clf.score(testing[features_col], testing[target_col])
Out[32]: