In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 12)
In [2]:
PATH = "data\\bulldozers\\"
train = pd.read_csv(f'{PATH}train_jDb5RBj.csv', low_memory=False)
ids, y = train['ID'], train['Purchase']
In [3]:
test = pd.read_csv(PATH + 'test_dan2xFI.csv')
In [4]:
train_ = train.drop(['ID', 'Purchase'], axis=1)
test_ = test.drop('ID', axis=1)

# Cast every column to string so all features can be passed to CatBoost as categoricals
for i in train_.columns:
    train_[i] = train_[i].apply(str)
    test_[i] = test_[i].apply(str)
In [5]:
# np.float is a removed NumPy alias; after the string cast above every column is non-float, i.e. categorical
categorical_features_indices = np.where(train_.dtypes != float)[0]
In [6]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(train_, train['Purchase'], train_size=0.8, random_state=1234)
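Before fitting, it is worth checking the class balance of the target; the class_weights=[1,3] used in the next cell suggests the positive class is the minority (a quick illustrative check, assuming 'Purchase' is a binary 0/1 target):
In [ ]:
# Hedged sanity check: distribution of the binary target
print(train['Purchase'].value_counts(normalize=True))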
In [7]:
#importing library and building model
from catboost import CatBoostClassifier
model=CatBoostClassifier(iterations=100, depth=10, learning_rate=0.01, loss_function='Logloss',class_weights=[1,3])
model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation))
Out[7]:
In [77]:
model=CatBoostClassifier(iterations=300, depth=10, learning_rate=0.02, loss_function='Logloss')
model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation))
Out[77]:
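A quick hedged check of how the refit model performs on the held-out split (roc_auc_score is from sklearn.metrics; this cell is illustrative and not part of the original run):
In [ ]:
from sklearn.metrics import roc_auc_score
# Probability of the positive class on the validation split
val_proba = model.predict_proba(X_validation)[:, 1]
print('Validation AUC: %.4f' % roc_auc_score(y_validation, val_proba))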
In [78]:
prediction_proba = model.predict_proba(test_)
In [79]:
def make_submission(probs):
    sample = pd.read_csv(PATH + 'sample.csv')
    submit = sample.copy()
    submit['Purchase'] = probs
    return submit
In [80]:
submit = make_submission(prediction_proba[:,1])
In [81]:
submit.to_csv(PATH + 'cat_300_.02.csv', index=False)
In [26]:
from sklearn.ensemble import GradientBoostingClassifier  # GBM algorithm
from sklearn.model_selection import cross_val_score, GridSearchCV  # CV scoring and grid search (sklearn.cross_validation / sklearn.grid_search were removed from sklearn)
from sklearn import metrics  # additional scoring functions
In [60]:
def modelfit(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5):
    # Fit the algorithm on the full training data (y is the global target defined above)
    alg.fit(dtrain[predictors], y)

    # Predictions and predicted probabilities on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Perform cross-validation
    if performCV:
        cv_score = cross_val_score(alg, dtrain[predictors], y, cv=cv_folds, scoring='roc_auc')

    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))
    if performCV:
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" %
              (np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

    # Plot feature importances
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        plt.figure(figsize=(20, 20))
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
In [40]:
#Choose all predictors except target & IDcols
predictors = train_.columns
gbm0 = GradientBoostingClassifier(random_state=10)
modelfit(gbm0, train, predictors)
In [45]:
param_test1 = {'n_estimators':[20, 30, 40, 50, 60, 70, 80, 90]}
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.05, min_samples_split=500,
                                         min_samples_leaf=50, max_depth=8,
                                         max_features='sqrt', subsample=0.8, random_state=10),
    param_grid=param_test1, scoring='roc_auc', n_jobs=4, cv=5)
gsearch1.fit(train[predictors], y)
Out[45]:
In [46]:
gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_
Out[46]:
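To see the whole grid rather than just the best point, cv_results_ can be viewed as a DataFrame (a small sketch; the column names follow the sklearn documentation):
In [ ]:
# Mean/std of the CV AUC for every n_estimators value tried, best first
results = pd.DataFrame(gsearch1.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False))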
In [55]:
## Test 2
param_test2 = {'max_depth':[5, 7, 9, 11, 13, 15] ,'min_samples_split': [200, 400, 600, 800, 1000]}
gsearch2 = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.05, n_estimators=70, max_features='sqrt',
                                         subsample=0.8, random_state=10),
    param_grid=param_test2, scoring='roc_auc', n_jobs=4, cv=5)
In [56]:
gsearch2.fit(train[predictors], y)
gsearch2.cv_results_['mean_test_score'], gsearch2.best_params_, gsearch2.best_score_
Out[56]:
In [57]:
#test 3
param_test3 = {'min_samples_split': [800, 1000, 1200, 1400, 1600] , 'min_samples_leaf': [30, 40, 50, 60, 70]}
gsearch3 = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.05, n_estimators=70, max_depth=5,
                                         max_features='sqrt', subsample=0.8, random_state=10),
    param_grid=param_test3, scoring='roc_auc', n_jobs=4, cv=5)
In [58]:
gsearch3.fit(train[predictors], y)
gsearch3.cv_results_['mean_test_score'], gsearch3.best_params_, gsearch3.best_score_
Out[58]:
In [61]:
modelfit(gsearch3.best_estimator_, train, predictors)
In [69]:
#test 4
param_test4 = {'max_features': [7, 9, 11, 13, 15, 17, 19, 21]}
gsearch4 = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.05, min_samples_split=800, n_estimators=70,
                                         max_depth=5, max_features='sqrt', subsample=0.8, random_state=10),
    param_grid=param_test4, scoring='roc_auc', n_jobs=4, cv=5)
In [70]:
gsearch4.fit(train[predictors], y )
gsearch4.cv_results_['mean_test_score'], gsearch4.best_params_, gsearch4.best_score_
Out[70]:
In [71]:
#test 5
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.05, min_samples_split=800,
                                         n_estimators=70, max_depth=5, max_features=9,
                                         subsample=0.8, random_state=10),
    param_grid=param_test5, scoring='roc_auc', n_jobs=4, cv=5)
In [73]:
gsearch5.fit(train[predictors], y )
gsearch5.cv_results_['mean_test_score'], gsearch5.best_params_, gsearch5.best_score_
Out[73]:
In [74]:
gbm_tuned_1 = GradientBoostingClassifier(learning_rate=0.025, n_estimators=140,max_depth=5,
min_samples_split=800,min_samples_leaf=60,
subsample=0.85, random_state=10, max_features=9)
modelfit(gbm_tuned_1, train, predictors)
In [75]:
gbm_tuned_2 = GradientBoostingClassifier(learning_rate=0.05, n_estimators=900,max_depth=5,
min_samples_split=800,min_samples_leaf=60,
subsample=0.85, random_state=10, max_features=9)
modelfit(gbm_tuned_2, train, predictors)
In [76]:
prediction_proba_2 = gbm_tuned_2.predict_proba(test_)
In [77]:
submit = make_submission(prediction_proba_2[:,1])
In [78]:
submit.to_csv(PATH + 'gbm_tuned_2.csv', index=False)
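Optional sanity check before uploading (illustrative; assumes the platform expects exactly the ID and Purchase columns):
In [ ]:
check = pd.read_csv(PATH + 'gbm_tuned_2.csv')
print(check.shape)
print(check.head())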
In [97]:
def ensemble():
    stacked_1 = pd.read_csv('ensemble_.csv')              # .842
    stacked_2 = pd.read_csv('315_37929_us_ensemble.csv')  # .842
    stacked_3 = pd.read_csv('315_67174_us_asd.csv')       # .838
    stacked_4 = pd.read_csv('315_67174_us_cat_1.csv')     # .835
    stacked_5 = pd.read_csv('ensemble_.csv')              # .835
    stacked_6 = pd.read_csv('ensemble_.csv')              # .841
    sub = pd.DataFrame()
    sub['ID'] = stacked_1['ID']
    # Simple average of the absolute predicted probabilities from each file
    sub['Purchase'] = np.mean(
        [
            stacked_1['Purchase'].abs(),
            stacked_2['Purchase'].abs(),
            stacked_3['Purchase'].abs(),
            stacked_4['Purchase'].abs(),
            stacked_5['Purchase'].abs(),
            stacked_6['Purchase'].abs(),
        ], axis=0)
    sub.to_csv('ensemble_3_last.csv', index=False, float_format='%.6f')
In [98]:
ensemble()
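A hedged variant of the same blend that weights each file instead of taking a plain mean (the file list mirrors ensemble() above; the weights are purely illustrative, not tuned):
In [ ]:
def ensemble_weighted(weights=(0.25, 0.2, 0.15, 0.1, 0.1, 0.2)):
    files = ['ensemble_.csv', '315_37929_us_ensemble.csv', '315_67174_us_asd.csv',
             '315_67174_us_cat_1.csv', 'ensemble_.csv', 'ensemble_.csv']
    frames = [pd.read_csv(f) for f in files]
    sub = pd.DataFrame()
    sub['ID'] = frames[0]['ID']
    # Weighted average of the absolute probabilities, same idea as the plain mean above
    sub['Purchase'] = np.average([f['Purchase'].abs() for f in frames], axis=0, weights=weights)
    sub.to_csv('ensemble_weighted.csv', index=False, float_format='%.6f')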