In [1]:
import pandas as pd
In [2]:
# Load the two 2013 school-level tables: one used for exploration, one for
# validation. The reads happen in the order the paths are listed.
escolas_exp, escolas_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dados/2013/TS_ESCOLA_average_exploration_data.csv',
        '../dados/2013/TS_ESCOLA_average_validation_data.csv',
    )
)
In [114]:
# Print a summary of the exploration frame loaded from the CSV above.
escolas_exp.info()
In [3]:
# Pairwise correlation matrix restricted to the columns whose name matches
# 'TX_RESP'; rows and columns of `corrs` are both those column names.
corrs = (
    escolas_exp
    .filter(regex='TX_RESP')
    .corr()
)
In [147]:
# from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
def feature_score(escolas_dataset, feature):
    """Score a feature by its correlation with the target column.

    Parameters
    ----------
    escolas_dataset : pandas.DataFrame
        Frame containing both `feature` and the `MEDIA_9EF_MT` column.
    feature : str
        Name of the column to score.

    Returns
    -------
    float
        Correlation (pandas default method) between `feature` and
        `MEDIA_9EF_MT`. The value is signed.
    """
    # NOTE(review): the score is signed, so a strongly negative feature ranks
    # below a weakly positive one -- confirm whether abs() is wanted here.
    #
    # Alternative scoring that was tried and left disabled: fit a single
    # feature DecisionTreeRegressor and use its training-set score.
    # X = escolas_dataset[feature].values.reshape(-1, 1)
    # Y = escolas_dataset.MEDIA_9EF_MT.values
    # return DecisionTreeRegressor().fit(X, Y).score(X, Y)
    return escolas_dataset[feature].corr(escolas_dataset.MEDIA_9EF_MT)
In [148]:
# Greedy, single-pass pruning of correlated features.
#
# For every feature still kept, find the other kept feature it is most
# strongly correlated with (by absolute value). If that correlation reaches
# `correlation_tolerance`, keep whichever of the two has the higher
# feature_score and discard the other.
kept_features = corrs.columns.values.tolist()
correlation_tolerance = 0.40
for feature_name, correlations in corrs.items():
    if feature_name not in kept_features:
        continue  # already discarded in favour of another feature

    # Absolute correlation with every *other* feature still kept. The feature
    # itself is dropped by label rather than by position: relying on "second
    # from the end of the sorted list" breaks when a NaN is present (sorted
    # last) or when another feature ties at 1.0 with the self-correlation.
    candidates = (
        correlations
        .filter(axis=0, items=kept_features)
        .drop(labels=feature_name, errors='ignore')
        .abs()
        .dropna()
    )
    if candidates.empty:
        continue

    rival_name = candidates.idxmax()
    if candidates.max() < correlation_tolerance:
        continue

    rival_score = feature_score(escolas_exp, rival_name)
    own_score = feature_score(escolas_exp, feature_name)
    if rival_score > own_score:
        print('{} vs {}: kept {}'.format(feature_name, rival_name, rival_name))
        kept_features.remove(feature_name)
    else:
        print('{} vs {}: kept {}'.format(feature_name, rival_name, feature_name))
        kept_features.remove(rival_name)
In [139]:
# Fit a decision tree on all 'TX_RESP' columns of the exploration data and
# report its score on the validation data.
feature_columns = escolas_exp.filter(regex='TX_RESP').columns
X = escolas_exp[feature_columns].values
Y = escolas_exp.MEDIA_9EF_MT
# `rf` is a single DecisionTreeRegressor (the name is kept for later cells).
# random_state is fixed so that re-running the cell gives the same score.
rf = DecisionTreeRegressor(random_state=0).fit(X, Y)
# Select validation columns by the training column list so both matrices have
# the same features in the same order, instead of trusting a second regex.
X_test = escolas_val[feature_columns].values
Y_test = escolas_val.MEDIA_9EF_MT
rf.score(X_test, Y_test)
Out[139]:
In [130]:
# Signed correlations of TX_RESP_Q033 with every column of `corrs`, ascending.
corrs.TX_RESP_Q033.sort_values()
Out[130]: