In [2]:
import os
import pandas as pd
import math
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
%matplotlib inline
In [74]:
columns = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
cardf = pd.read_csv("car_data.csv", names=columns)
cardf = cardf.sample(frac=1).reset_index(drop=True)
cardf.info()
In [75]:
cardf.head()
Out[75]:
In [55]:
# Encode the categorical features as numeric codes
cat_features = ['buying','maint','doors','persons','lug_boot','safety']
for feature in cat_features:
    cardf[feature] = cardf[feature].astype('category').cat.codes
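Note that cat.codes assigns codes in alphabetical order, which ignores the natural ordering of values such as low < med < high. Below is a minimal sketch of an explicit ordinal encoding instead; the ordinal_maps dict is hypothetical and assumes the standard UCI Car Evaluation value sets.
In [ ]:
# Sketch: explicit ordinal encoding instead of alphabetical cat.codes.
# The value lists assume the standard UCI Car Evaluation categories.
ordinal_maps = {
    'buying':   {'low': 0, 'med': 1, 'high': 2, 'vhigh': 3},
    'maint':    {'low': 0, 'med': 1, 'high': 2, 'vhigh': 3},
    'doors':    {'2': 0, '3': 1, '4': 2, '5more': 3},
    'persons':  {'2': 0, '4': 1, 'more': 2},
    'lug_boot': {'small': 0, 'med': 1, 'big': 2},
    'safety':   {'low': 0, 'med': 1, 'high': 2},
}
# applied with: cardf[feature] = cardf[feature].map(ordinal_maps[feature])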
In [48]:
# Split the data into train and test sets
msk = np.random.rand(len(cardf))<0.7
ytrain = cardf['class'].iloc[msk]
xtrain = cardf[msk].drop(['class'],axis=1)
ytest = cardf['class'].iloc[~msk]
xtest = cardf[~msk].drop(['class'],axis=1)
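The random mask above gives only an approximate 70/30 split and changes on every run. A sketch of the same split with sklearn's train_test_split follows; random_state and stratify are additions for reproducibility, not in the original.
In [ ]:
from sklearn.model_selection import train_test_split

# Sketch: reproducible, stratified 70/30 split
xtrain, xtest, ytrain, ytest = train_test_split(
    cardf.drop(['class'], axis=1), cardf['class'],
    test_size=0.3, random_state=42, stratify=cardf['class'])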
In [88]:
# Classify using a decision tree with the entropy and gini criteria
dtree_entropy = DecisionTreeClassifier(criterion='entropy')
dtree_entropy.fit(xtrain,ytrain)
entropy_score = dtree_entropy.score(xtest,ytest)
print('Score using entropy criterion:',entropy_score)
dtree_gini = DecisionTreeClassifier(criterion='gini')
dtree_gini.fit(xtrain,ytrain)
gini_score = dtree_gini.score(xtest,ytest)
print('Score using gini criterion:',gini_score)
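A single random split makes the entropy-vs-gini comparison depend on chance; cross-validation averages that noise out. A minimal sketch using cross_val_score:
In [ ]:
from sklearn.model_selection import cross_val_score

# Sketch: 5-fold cross-validated accuracy for each criterion
X, y = cardf.drop(['class'], axis=1), cardf['class']
for criterion in ('entropy', 'gini'):
    scores = cross_val_score(DecisionTreeClassifier(criterion=criterion), X, y, cv=5)
    print(criterion, scores.mean())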
In [101]:
plt.barh(range(len(cat_features)),dtree_gini.feature_importances_,align='center',alpha=0.4,color='b',label='Gini')
plt.barh(range(len(cat_features)),dtree_entropy.feature_importances_,align='center',alpha=0.4,color='g',label='Entropy')
plt.yticks(range(len(cat_features)),cat_features)
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.legend(loc='best')
plt.show()
2- Balance the data contained in "train.csv", apply the Decision Tree algorithm, and submit the result to Kaggle. Try to improve on the result obtained in class (position 3100 on the leaderboard).
In [118]:
# Load the data
portodf = pd.read_csv('train/train.csv')
testsdf = pd.read_csv('test/test.csv')
In [7]:
# Inspect the data
portodf.head()
Out[7]:
In [12]:
portodf.target.value_counts()
Out[12]:
The classes are heavily imbalanced.
In [115]:
# Balance the data by undersampling the majority class (target==0)
samples = 25000
majority_idx = portodf[portodf.target==0].index
minority_idx = portodf[portodf.target==1].index
selected_idx = set(np.random.choice(majority_idx, samples, replace=False)).union(minority_idx)
sampled_portodf = portodf.loc[list(selected_idx)]
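Undersampling discards most of the majority class. A lighter-weight alternative, sketched below, keeps every row and compensates for the imbalance with per-class weights via DecisionTreeClassifier's class_weight option:
In [ ]:
# Sketch: keep the full dataset and reweight classes instead of undersampling
weighted_tree = DecisionTreeClassifier(class_weight='balanced')
# weighted_tree.fit(...) on a split of the full portodf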
In [108]:
# Split the data into train and test sets
def split_train(data):
    msk = np.random.rand(len(data))<0.7
    xtrain = data.drop(['target'],axis=1).iloc[msk]
    ytrain = data.target[msk]
    xtest = data.drop(['target'],axis=1).iloc[~msk]
    ytest = data.target[~msk]
    return xtrain,ytrain,xtest,ytest
In [109]:
# Baseline: a decision tree on the full, unbalanced data for comparison
from sklearn.metrics import classification_report
xtrain,ytrain,xtest,ytest = split_train(portodf)
dtree = DecisionTreeClassifier()
dtree.fit(xtrain,ytrain)
predictions = dtree.predict(xtest)
print(classification_report(ytest,predictions))
In [113]:
# Same model, now trained on the balanced sample
xtrain,ytrain,xtest,ytest = split_train(sampled_portodf)
dtree = DecisionTreeClassifier()
dtree.fit(xtrain,ytrain)
predictions = dtree.predict(xtest)
print(classification_report(ytest,predictions))
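A confusion matrix makes the error pattern of the balanced model explicit; a short sketch using sklearn's confusion_matrix:
In [ ]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes
print(confusion_matrix(ytest, predictions))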
Simply by resampling the dataset to make it more balanced, both precision and recall improve.
In [125]:
# Predict on the Kaggle test set and build the submission
predictions = dtree.predict(testsdf)
submission = pd.DataFrame()
submission['id'] = testsdf['id']
submission['target'] = predictions
In [121]:
submission.to_csv('kaggle_submission.csv', index=False)
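If this is the Porto Seguro competition (as the portodf name suggests), the leaderboard metric is the normalized Gini coefficient, which ranks probabilities rather than hard 0/1 labels, so submitting predict_proba scores usually helps. A sketch, assuming the same fitted dtree:
In [ ]:
# Sketch: submit the predicted probability of target==1 instead of hard labels
proba = dtree.predict_proba(testsdf)[:, 1]
prob_submission = pd.DataFrame({'id': testsdf['id'], 'target': proba})
prob_submission.to_csv('kaggle_submission_proba.csv', index=False)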
In [124]:
submission.target.value_counts()
Out[124]: