In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
import datetime
In [2]:
def now():
    """Return the current local time as one whitespace-free string.

    The result is the default string form of ``datetime.datetime.now()``
    with the space between the date and the time replaced by an
    underscore, e.g. ``2015-04-01_12:34:56.789012``.
    """
    current = datetime.datetime.now()
    # str(datetime) is isoformat(' '); asking for '_' as the separator
    # gives the same text with the single space already replaced.
    return current.isoformat('_')
In [3]:
# Read the train and test tables from the raw_data directory (paths are
# relative to the notebook's working directory).
df_train, df_test = map(pd.read_csv, ('raw_data/train.csv', 'raw_data/test.csv'))
In [4]:
def convert_class_label_to_int(class_label):
    """Turn a class label string into a zero-based integer index.

    The first six characters are dropped, the remainder is parsed as an
    integer and 1 is subtracted, so ``'Class_1'`` becomes ``0``.

    Raises:
        ValueError: if the text after the sixth character is not an integer.
    """
    prefix_length = 6  # number of characters in 'Class_'
    one_based_index = int(class_label[prefix_length:])
    return one_based_index - 1
In [5]:
def find_means_stds(features_array):
    """Compute the mean and standard deviation of every column.

    Args:
        features_array: 2-D array indexed as ``[row, column]``.

    Returns:
        dict with two lists, ``'means'`` and ``'stds'``, each holding one
        value per column of ``features_array`` in column order. The
        standard deviation is ``np.std`` with its default arguments.
    """
    column_means = []
    column_stds = []
    for column_index in range(features_array.shape[1]):
        column = features_array[:, column_index]
        column_means.append(np.mean(column))
        column_stds.append(np.std(column))
    return {'means': column_means, 'stds': column_stds}
In [6]:
def z_score_feature(feature_slice, mean, std):
    """Standardize values as ``(feature_slice - mean) / std``.

    Args:
        feature_slice: array (or scalar) of raw feature values.
        mean: value subtracted from every element.
        std: value the centered data is divided by.

    Returns:
        The centered and scaled values. Where ``std`` is exactly zero
        (a constant feature) the data is only centered, i.e. divided by 1,
        instead of producing NaN/inf from a division by zero.
    """
    # A zero standard deviation would turn the whole column into NaN/inf,
    # which downstream estimators cannot consume; use a scale of 1 there.
    safe_std = np.where(np.asarray(std) == 0, 1.0, std)
    return (feature_slice - mean) / safe_std
In [7]:
# Names of the 93 feature columns: feat_1 ... feat_93.
feature_columns = ['feat_' + str(number) for number in range(1, 94)]

# Integer form of the 'target' column, stored next to the original labels.
renamed_labels = list(map(convert_class_label_to_int, df_train['target'].values))
df_train['renamed_labels'] = renamed_labels
In [8]:
# Per-feature means and standard deviations, computed from the training
# features only; the cells below apply them to both df_train and df_test.
mstddict = find_means_stds(df_train.loc[:, feature_columns].values)
In [9]:
# Standardize every training feature with the statistics in mstddict.
# NOTE: the columns are overwritten in place, so running this cell a second
# time rescales values that are already standardized.
for position, column_name in enumerate(feature_columns):
    column_mean = mstddict['means'][position]
    column_std = mstddict['stds'][position]
    df_train[column_name] = z_score_feature(df_train[column_name].values, column_mean, column_std)
In [10]:
# Standardize every test feature with the same mstddict statistics that
# were computed from the training data.
# NOTE: the columns are overwritten in place, so running this cell a second
# time rescales values that are already standardized.
for position, column_name in enumerate(feature_columns):
    column_mean = mstddict['means'][position]
    column_std = mstddict['stds'][position]
    df_test[column_name] = z_score_feature(df_test[column_name].values, column_mean, column_std)
In [11]:
# Alternative classifier configurations, left commented out:
# rf = AdaBoostClassifier(n_estimators=100)
# clrf = RandomForestClassifier(n_estimators=400, max_features=25)
# clf = AdaBoostClassifier(base_estimator=clrf, n_estimators=15)
# clf2 = GradientBoostingClassifier(n_estimators=250, max_depth=8, max_features=15)

# Active model. random_state is pinned because max_features=20 makes each
# split consider a random subset of the features; without a seed every
# Restart & Run All would train a different model and write different
# predictions.
clf = GradientBoostingClassifier(
    n_estimators=700,
    max_depth=7,
    max_features=20,
    learning_rate=0.03,
    random_state=0,
)
In [12]:
# Train on the standardized feature columns against the integer labels.
# The call stays the last expression so the fitted estimator is displayed.
clf.fit(
    X=df_train[feature_columns].values,
    y=df_train['renamed_labels'].values,
)
Out[12]:
In [13]:
# Class-probability predictions for the test set (labels_pr) and for the
# training set itself (labels_pr_tr).
labels_pr, labels_pr_tr = (
    clf.predict_proba(frame[feature_columns].values)
    for frame in (df_test, df_train)
)
In [14]:
# One timestamp is shared by both output files, so the test-set file and the
# training-set file written by the same run carry the same name.
now_name = now()

# Output column names; column k of a probability array is written as
# Class_<k+1>.
class_columns = ['Class_' + str(k + 1) for k in range(9)]

# (source frame, predicted probabilities, output directory) per file.
outputs = [
    (df_test, labels_pr, 'submissions/'),
    (df_train, labels_pr_tr, 'train_predict/'),
]
for frame, probabilities, out_dir in outputs:
    predict_dict = {'id': frame['id'].values}
    for k, column in enumerate(class_columns):
        predict_dict[column] = probabilities[:, k]
    # Pass the column order explicitly: building a DataFrame from a plain
    # dict relies on dict ordering, which older Python/pandas versions do
    # not preserve (keys get sorted, moving 'id' behind the class columns).
    df_sub = pd.DataFrame(predict_dict, columns=['id'] + class_columns)
    fname = out_dir + now_name + '.csv'
    df_sub.to_csv(fname, index=False)
In [ ]: