In [ ]:
%matplotlib inline
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [ ]:
# Load the raw tables. Both hit tables are indexed by `global_id`; the wire
# lookup table is indexed by `wire_id` so it can be queried with `.loc` below.
hits_train, hits_test = (
    pd.read_csv(hits_csv, index_col='global_id')
    for hits_csv in ("data/train.csv", "data/test.csv")
)
wires = pd.read_csv("data/wires.csv", index_col='wire_id')

In [ ]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same model and
# therefore the same prediction file. Both estimators are randomised.
RANDOM_SEED = 42

# Base learner: a 200-tree random forest using the Gini split criterion.
rf_classifier = RandomForestClassifier(
    criterion='gini',
    n_estimators=200,
    random_state=RANDOM_SEED,
)

# Boosted ensemble of 100 random forests.
# The base learner is passed positionally on purpose: the keyword was called
# `base_estimator` up to scikit-learn 1.1 and was renamed to `estimator` in 1.2
# (old name removed in 1.4), but it has always been the first positional
# parameter, so this form works on both old and new releases.
classifier = AdaBoostClassifier(
    rf_classifier,
    n_estimators=100,
    random_state=RANDOM_SEED,
)

In [ ]:
# --- Feature engineering for the training hits (adds columns to hits_train) ---

# Copy each hit's wire geometry from the `wires` table, matched on wire_id.
# `.values` drops the wires index so assignment is positional, row by row.
train_wire_ids = hits_train.wire_id.values
for geometry_col in ("wire_rho", "wire_phi"):
    hits_train.loc[:, geometry_col] = wires.loc[train_wire_ids, geometry_col].values

# rho * cos(phi) and rho * sin(phi)
# (presumably the wire's Cartesian x / y if (rho, phi) are polar coordinates -- confirm).
train_rho = hits_train['wire_rho']
train_phi = hits_train['wire_phi']
hits_train['rho_cos_phi'] = train_rho * np.cos(train_phi)
hits_train['rho_sin_phi'] = train_rho * np.sin(train_phi)

# Squared log-energy and squared relative time.
# NOTE: the log is not finite for energy_deposit <= 0; such rows are excluded
# by the `energy_deposit > 0` filter applied when the classifier is fitted.
hits_train['transformed_energy'] = np.log(hits_train['energy_deposit']).pow(2)
hits_train['transformed_time'] = hits_train['relative_time'].pow(2)

# Columns fed to the classifier; the order defines the feature-matrix layout.
_features = [
    'transformed_energy',
    'transformed_time',
    'wire_rho',
    'wire_phi',
    'rho_cos_phi',
    'rho_sin_phi',
]

In [ ]:
# --- Feature engineering for the test hits (same columns as the training set) ---

# Copy each hit's wire geometry from the `wires` table, matched on wire_id.
# `.values` drops the wires index so assignment is positional, row by row.
test_wire_ids = hits_test.wire_id.values
for geometry_col in ("wire_rho", "wire_phi"):
    hits_test.loc[:, geometry_col] = wires.loc[test_wire_ids, geometry_col].values

# rho * cos(phi) and rho * sin(phi)
# (presumably the wire's Cartesian x / y if (rho, phi) are polar coordinates -- confirm).
test_rho = hits_test['wire_rho']
test_phi = hits_test['wire_phi']
hits_test['rho_cos_phi'] = test_rho * np.cos(test_phi)
hits_test['rho_sin_phi'] = test_rho * np.sin(test_phi)

# Squared log-energy and squared relative time.
# NOTE: the log is not finite for energy_deposit <= 0; those rows are dropped
# from `candidates` just below.
hits_test['transformed_energy'] = np.log(hits_test['energy_deposit']).pow(2)
hits_test['transformed_time'] = hits_test['relative_time'].pow(2)

# Only hits with a strictly positive energy deposit are scored later on.
has_positive_energy = hits_test.energy_deposit > 0
candidates = hits_test.loc[has_positive_energy]

In [ ]:
# Train only on hits with a strictly positive energy deposit: the log-based
# `transformed_energy` feature is not finite for the others.
train_candidates = hits_train[hits_train.energy_deposit > 0]

# Binary target: True where label == 1, False for every other label value.
classifier.fit(train_candidates[_features], (train_candidates.label == 1).values)

In [ ]:
# Score the candidate test hits. Column 1 of `predict_proba` is taken as the
# probability of the positive class (the target at fit time is the boolean
# `label == 1`).
signal_proba = classifier.predict_proba(candidates[_features])[:, 1]

# One row per candidate, keyed by the candidates' index. Test hits that were
# excluded from `candidates` (energy_deposit <= 0) get no row here.
prediction = pd.DataFrame({"prediction": signal_proba}, index=candidates.index)

In [ ]:
# Write the predictions to disk; the index column is labelled `global_id`
# in the CSV header.
prediction.to_csv(path_or_buf="prediction.csv", index_label='global_id')