In [ ]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
In [ ]:
# Load the hit tables (both indexed by 'global_id') and the wire lookup
# table (indexed by 'wire_id').
hits_train, hits_test = (
    pd.read_csv(csv_path, index_col='global_id')
    for csv_path in ("data/train.csv", "data/test.csv")
)
wires = pd.read_csv("data/wires.csv", index_col='wire_id')
In [ ]:
# Base learner: a 200-tree random forest split on Gini impurity.
rf_classifier = RandomForestClassifier(criterion='gini', n_estimators=200)

# Boost 100 copies of the forest with AdaBoost.
# scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to
# `estimator`, and 1.4 removed the old name, so try the current keyword
# first and fall back to the legacy one on older installations (where the
# unknown keyword raises TypeError).
# NOTE(review): no random_state is set, so results differ from run to run.
try:
    classifier = AdaBoostClassifier(estimator=rf_classifier, n_estimators=100)
except TypeError:
    classifier = AdaBoostClassifier(base_estimator=rf_classifier, n_estimators=100)
In [ ]:
# Copy each hit's wire_rho / wire_phi from the wire table, looked up by the
# hit's wire_id (positional assignment via .values, so the hit index is kept).
train_wire_ids = hits_train.wire_id.values
for wire_column in ("wire_rho", "wire_phi"):
    hits_train.loc[:, wire_column] = wires.loc[train_wire_ids, wire_column].values

# rho*cos(phi) and rho*sin(phi) -- presumably the wire's Cartesian x/y if
# wire_rho / wire_phi are polar coordinates (TODO confirm against the data).
train_rho, train_phi = hits_train['wire_rho'], hits_train['wire_phi']
hits_train['rho_cos_phi'] = train_rho * np.cos(train_phi)
hits_train['rho_sin_phi'] = train_rho * np.sin(train_phi)

# Squared log of the energy deposit and squared relative time.
# np.log is not finite for non-positive energy_deposit; such rows are
# excluded later by the `energy_deposit > 0` filter applied before fitting.
hits_train['transformed_energy'] = np.log(hits_train['energy_deposit']).pow(2)
hits_train['transformed_time'] = hits_train['relative_time'].pow(2)

# Columns fed to the classifier, in this order.
_features = [
    'transformed_energy',
    'transformed_time',
    'wire_rho',
    'wire_phi',
    'rho_cos_phi',
    'rho_sin_phi',
]
In [ ]:
# Apply to the test hits the same feature construction used for training:
# wire_rho / wire_phi looked up by wire_id, then the derived columns.
test_wire_ids = hits_test.wire_id.values
for wire_column in ("wire_rho", "wire_phi"):
    hits_test.loc[:, wire_column] = wires.loc[test_wire_ids, wire_column].values

test_rho, test_phi = hits_test['wire_rho'], hits_test['wire_phi']
hits_test['rho_cos_phi'] = test_rho * np.cos(test_phi)
hits_test['rho_sin_phi'] = test_rho * np.sin(test_phi)

# np.log is not finite for non-positive energy_deposit; those rows are
# dropped from `candidates` just below.
hits_test['transformed_energy'] = np.log(hits_test['energy_deposit']).pow(2)
hits_test['transformed_time'] = hits_test['relative_time'].pow(2)

# Only hits with a strictly positive energy deposit are scored.
has_energy = hits_test['energy_deposit'] > 0
candidates = hits_test.loc[has_energy]
In [ ]:
# Fit on training hits with a strictly positive energy deposit only.
# The target is a boolean array: True where label == 1, False otherwise.
_train_hits = hits_train[hits_train.energy_deposit > 0]
classifier.fit(_train_hits[_features], (_train_hits.label == 1).values)
In [ ]:
# Column 1 of predict_proba is the probability of the second entry in
# classifier.classes_ -- presumably True (label == 1), given the boolean
# target used for fitting; verify classes_ if in doubt.
signal_proba = classifier.predict_proba(candidates[_features])[:, 1]

# One row per candidate hit, keyed by the candidates' own index.
prediction = pd.DataFrame({"prediction": signal_proba}, index=candidates.index)
In [ ]:
# Write the predictions to disk; the index column is labelled 'global_id'.
prediction.to_csv(path_or_buf="prediction.csv", index_label='global_id')