In [1]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
    
In [2]:
    
PATH = "data\\bulldozers\\"
train = pd.read_csv(f'{PATH}train_jDb5RBj.csv', low_memory=False)
ids, y = train['ID'], train['Purchase']
    
In [3]:
    
test = pd.read_csv(PATH + 'test_dan2xFI.csv')
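A quick look at the shapes and at the class balance of Purchase makes the later class_weights choice easier to follow (a hypothetical inspection cell, not part of the original run):

In [ ]:
    
print(train.shape, test.shape)
train['Purchase'].value_counts(normalize=True)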
    
In [4]:
    
# Drop the ID and target columns, then cast every remaining feature to
# string so CatBoost treats them all as categorical.
train_ = train.drop(['ID', 'Purchase'], axis=1)
test_ = test.drop('ID', axis=1)
for col in train_.columns:
    train_[col] = train_[col].astype(str)
    test_[col] = test_[col].astype(str)
    
In [5]:
    
# After the cast above every column is dtype object, so this selects all of them.
# (np.float was removed from NumPy; plain float works the same here.)
categorical_features_indices = np.where(train_.dtypes != float)[0]
    
In [6]:
    
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(train_, train['Purchase'], train_size=0.8, random_state=1234)
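Since the CatBoost cell below upweights the positive class (class_weights=[1, 3]), the target looks imbalanced, and a stratified split keeps the validation fold's class ratio representative. A minimal variant (an assumption, not what the original run used):

In [ ]:
    
X_train, X_validation, y_train, y_validation = train_test_split(
    train_, train['Purchase'], train_size=0.8, random_state=1234,
    stratify=train['Purchase'])  # hypothetical: preserve the class ratio in both folds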
    
    
In [7]:
    
# Import CatBoost and build a baseline model; class_weights=[1, 3]
# upweights the positive class to compensate for imbalance.
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=100, depth=10, learning_rate=0.01,
                           loss_function='Logloss', class_weights=[1, 3])
model.fit(X_train, y_train, cat_features=categorical_features_indices,
          eval_set=(X_validation, y_validation))
    
    
    Out[7]:
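The fit output above was not preserved, so a quick way to score the baseline on the held-out fold is to compute the validation AUC directly (a hypothetical check, assuming the competition metric is ROC AUC as the later grid searches suggest):

In [ ]:
    
from sklearn.metrics import roc_auc_score
val_probs = model.predict_proba(X_validation)[:, 1]  # probability of the positive class
print('Validation AUC: %.4f' % roc_auc_score(y_validation, val_probs))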
In [77]:
    
# Re-train with more iterations and a slightly higher learning rate.
model = CatBoostClassifier(iterations=300, depth=10, learning_rate=0.02,
                           loss_function='Logloss')
model.fit(X_train, y_train, cat_features=categorical_features_indices,
          eval_set=(X_validation, y_validation))
    
    
    Out[77]:
In [78]:
    
prediction_proba = model.predict_proba(test_)
    
In [79]:
    
def make_submission(probs):
    # Start from the provided sample submission so the ID order matches.
    sample = pd.read_csv(PATH + 'sample.csv')
    submit = sample.copy()
    submit['Purchase'] = probs
    return submit
    
In [80]:
    
submit = make_submission(prediction_proba[:,1])
    
In [81]:
    
submit.to_csv(PATH + 'cat_300_.02.csv', index=False)
    
In [26]:
    
from sklearn.ensemble import GradientBoostingClassifier              # GBM algorithm
from sklearn import metrics                                          # additional scikit-learn functions
from sklearn.model_selection import cross_val_score, GridSearchCV    # performing cross-validation and grid search
    
    
In [60]:
    
def modelfit(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5):
    # Fit the algorithm on the data (note: uses the global target y)
    alg.fit(dtrain[predictors], y)

    # Predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Perform cross-validation
    if performCV:
        cv_score = cross_val_score(alg, dtrain[predictors], y, cv=cv_folds, scoring='roc_auc')

    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))

    if performCV:
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g"
              % (np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

    # Plot feature importances
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        plt.figure(figsize=(20, 20))
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
    
In [40]:
    
# Baseline GBM: use every predictor except the target and ID columns.
# Note that train[predictors] keeps the original dtypes, not the
# stringified copies built for CatBoost.
predictors = train_.columns
gbm0 = GradientBoostingClassifier(random_state=10)
modelfit(gbm0, train, predictors)
    
    
    
In [45]:
    
param_test1 = {'n_estimators': [20, 30, 40, 50, 60, 70, 80, 90]}
gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.05, min_samples_split=500,
                                                             min_samples_leaf=50, max_depth=8,
                                                             max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=4, cv=5)
gsearch1.fit(train[predictors], y)
    
    Out[45]:
In [46]:
    
gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_
    
    Out[46]:
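Since grid_scores_ was removed from scikit-learn, cv_results_ is the replacement, and it is easier to read as a sorted DataFrame. A small helper (hypothetical, reusable for every gsearch below):

In [ ]:
    
def grid_report(gs):
    # Summarize a fitted GridSearchCV: one row per parameter combination,
    # best-ranked configuration first.
    res = pd.DataFrame(gs.cv_results_)
    cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
    return res[cols].sort_values('rank_test_score')

grid_report(gsearch1)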
In [55]:
    
# Test 2: tune tree depth and the split threshold together.
param_test2 = {'max_depth': [5, 7, 9, 11, 13, 15], 'min_samples_split': [200, 400, 600, 800, 1000]}
gsearch2 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.05, n_estimators=70,
                                                             max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid=param_test2, scoring='roc_auc', n_jobs=4, cv=5)
    
In [56]:
    
gsearch2.fit(train[predictors], y)
gsearch2.cv_results_['mean_test_score'], gsearch2.best_params_, gsearch2.best_score_
    
    Out[56]:
In [57]:
    
# Test 3: tune min_samples_split together with min_samples_leaf.
param_test3 = {'min_samples_split': [800, 1000, 1200, 1400, 1600], 'min_samples_leaf': [30, 40, 50, 60, 70]}
gsearch3 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.05, n_estimators=70, max_depth=5,
                                                             max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid=param_test3, scoring='roc_auc', n_jobs=4, cv=5)
    
In [58]:
    
gsearch3.fit(train[predictors], y)
gsearch3.cv_results_['mean_test_score'], gsearch3.best_params_, gsearch3.best_score_
    
    Out[58]:
In [61]:
    
modelfit(gsearch3.best_estimator_, train, predictors)
    
    
    
In [69]:
    
# Test 4: tune max_features (the grid overrides the estimator's setting,
# so the redundant max_features='sqrt' is dropped here).
param_test4 = {'max_features': [7, 9, 11, 13, 15, 17, 19, 21]}
gsearch4 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.05, min_samples_split=800,
                                                             n_estimators=70, max_depth=5,
                                                             subsample=0.8, random_state=10),
                        param_grid=param_test4, scoring='roc_auc', n_jobs=4, cv=5)
    
In [70]:
    
gsearch4.fit(train[predictors], y)
gsearch4.cv_results_['mean_test_score'], gsearch4.best_params_, gsearch4.best_score_
    
    Out[70]:
In [71]:
    
# Test 5: tune the subsampling fraction.
param_test5 = {'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]}
gsearch5 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.05, min_samples_split=800,
                                                             n_estimators=70, max_depth=5, max_features=9,
                                                             random_state=10),
                        param_grid=param_test5, scoring='roc_auc', n_jobs=4, cv=5)
    
In [73]:
    
gsearch5.fit(train[predictors], y)
gsearch5.cv_results_['mean_test_score'], gsearch5.best_params_, gsearch5.best_score_
    
    Out[73]:
In [74]:
    
# Final model 1: halve the learning rate and double the trees relative
# to the tuned baseline (0.05 / 70).
gbm_tuned_1 = GradientBoostingClassifier(learning_rate=0.025, n_estimators=140, max_depth=5,
                                         min_samples_split=800, min_samples_leaf=60,
                                         subsample=0.85, random_state=10, max_features=9)
modelfit(gbm_tuned_1, train, predictors)
    
    
    
In [75]:
    
# Final model 2: keep the learning rate at 0.05 but grow many more trees.
gbm_tuned_2 = GradientBoostingClassifier(learning_rate=0.05, n_estimators=900, max_depth=5,
                                         min_samples_split=800, min_samples_leaf=60,
                                         subsample=0.85, random_state=10, max_features=9)
modelfit(gbm_tuned_2, train, predictors)
    
    
    
In [76]:
    
# The GBM was trained on the original (non-stringified) columns, so
# predict on test[predictors] rather than the CatBoost copy test_.
prediction_proba_2 = gbm_tuned_2.predict_proba(test[predictors])
    
In [77]:
    
submit = make_submission(prediction_proba_2[:,1])
    
In [78]:
    
submit.to_csv(PATH + 'gbm_tuned_2.csv', index=False)
    
In [97]:
    
def ensemble():
    # Average several submission files; the leaderboard score of each
    # file is noted in the comment. stacked_1, stacked_5 and stacked_6
    # all point at the same 'ensemble_.csv' here.
    stacked_1 = pd.read_csv('ensemble_.csv')              # .842
    stacked_2 = pd.read_csv('315_37929_us_ensemble.csv')  # .842
    stacked_3 = pd.read_csv('315_67174_us_asd.csv')       # .838
    stacked_4 = pd.read_csv('315_67174_us_cat_1.csv')     # .835
    stacked_5 = pd.read_csv('ensemble_.csv')              # .835
    stacked_6 = pd.read_csv('ensemble_.csv')              # .841

    sub = pd.DataFrame()
    sub['ID'] = stacked_1['ID']
    sub['Purchase'] = np.mean(
        [
            stacked_1['Purchase'].abs(),
            stacked_2['Purchase'].abs(),
            stacked_3['Purchase'].abs(),
            stacked_4['Purchase'].abs(),
            stacked_5['Purchase'].abs(),
            stacked_6['Purchase'].abs(),
        ], axis=0)
    sub.to_csv('ensemble_3_last.csv', index=False, float_format='%.6f')
    
In [98]:
    
ensemble()
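A plain mean treats every file equally; since the per-file scores in the comments differ, a weighted average that favors the stronger submissions is a natural variant to try (a sketch under the assumption that the same files are on disk; the weights are illustrative, not tuned):

In [ ]:
    
def weighted_ensemble(files, weights, out_name='ensemble_weighted.csv'):
    # Weighted average of the Purchase column across submission files.
    frames = [pd.read_csv(f) for f in files]
    sub = pd.DataFrame({'ID': frames[0]['ID']})
    weights = np.asarray(weights, dtype=float)
    weights = weights / weights.sum()  # normalize so the weights sum to 1
    sub['Purchase'] = sum(w * f['Purchase'].abs() for w, f in zip(weights, frames))
    sub.to_csv(out_name, index=False, float_format='%.6f')

weighted_ensemble(
    ['ensemble_.csv', '315_37929_us_ensemble.csv', '315_67174_us_asd.csv'],
    weights=[0.842, 0.842, 0.838])  # illustrative: weight each file by its reported score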