In [1]:
import os
import pandas as pd
import math
import numpy as np
from sklearn.tree import DecisionTreeClassifier
In [2]:
# Read the training CSV from the local competition folder and preview
# its first five rows.
train_csv_path = 'kaggle_porto_seguro/train.csv'
data = pd.read_csv(train_csv_path)
data.head(n=5)
Out[2]:
In [3]:
# Print a summary of the training frame (column names, non-null counts,
# dtypes and memory usage) to check what was loaded.
data.info()
In [4]:
# Shuffle every row (frac=1 draws the whole frame, without replacement) and
# renumber the index from 0. random_state is pinned so that the shuffle, and
# therefore the positional train / hold-out split that follows, is the same
# after every "Restart Kernel -> Run All".
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
In [6]:
# Positional train / hold-out split of the shuffled frame.
# Column 1 is used as the label and columns 2 onward as features; column 0 is
# left out of both (presumably the row id -- confirm against data.head()).
N_TRAIN_ROWS = 585212  # rows [0, N_TRAIN_ROWS) train the model; the rest are held out

# Fail early with a clear message if the file is too short for this split,
# rather than silently producing an empty hold-out set.
if len(data) <= N_TRAIN_ROWS:
    raise ValueError(
        "expected more than %d rows to leave a hold-out set, got %d"
        % (N_TRAIN_ROWS, len(data))
    )

features = data.iloc[:, 2:]
labels = data.iloc[:, 1]
x_train, x_test = features.iloc[:N_TRAIN_ROWS], features.iloc[N_TRAIN_ROWS:]
y_train, y_test = labels.iloc[:N_TRAIN_ROWS], labels.iloc[N_TRAIN_ROWS:]
y_test.head()
Out[6]:
In [7]:
# Fit an entropy-criterion decision tree on the training split.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at each split, so ties between equally good splits can otherwise be
# broken differently from run to run and the fitted tree would not be
# reproducible.
d_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
d_tree.fit(x_train, y_train)
Out[7]:
In [8]:
# Evaluate the fitted tree on the held-out rows; for a classifier, `score`
# reports mean accuracy. The value is displayed as the cell output.
holdout_accuracy = d_tree.score(x_test, y_test)
holdout_accuracy
Out[8]:
In [9]:
# Read the competition test CSV from the local folder and preview its
# first five rows.
test_csv_path = 'kaggle_porto_seguro/test.csv'
test_data = pd.read_csv(test_csv_path)
test_data.head(n=5)
Out[9]:
In [10]:
# Predict a class label for every row of the test file, using every column
# except the first (column 0 is copied into the submission as `id` later on).
# NOTE(review): predict() returns hard class labels; if the competition scores
# probabilities, predict_proba(...)[:, 1] may be what is wanted -- confirm.
test_features = test_data.iloc[:, 1:]
predictions = d_tree.predict(test_features)
In [11]:
# Assemble the submission table in one step: the first column of the test
# file becomes `id`, the model output becomes `target`. Then preview it.
submission = pd.DataFrame({
    'id': test_data.iloc[:, 0],
    'target': predictions,
})
submission.head()
Out[11]:
In [13]:
# Write the submission table to disk, leaving out the DataFrame index so the
# file contains only the `id` and `target` columns.
submission_path = 'kaggle_submission.csv'
submission.to_csv(submission_path, index=False)
In [ ]: