In [1]:
import pandas as pd

In [2]:
escolas_exp = pd.read_csv('../dados/2013/TS_ESCOLA_average_exploration_data.csv')
escolas_val = pd.read_csv('../dados/2013/TS_ESCOLA_average_validation_data.csv')

In [114]:
escolas_exp.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22664 entries, 0 to 22663
Columns: 129 entries, Unnamed: 0 to TX_RESP_Q074
dtypes: float64(120), int64(9)
memory usage: 22.3 MB

In [3]:
corrs = escolas_exp.filter(regex='TX_RESP').corr()

In [147]:
# from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

def feature_score(escolas_dataset, feature):
    escolas_dataset[feature].corr(escolas_dataset.MEDIA_9EF_MT)
#     X = escolas_dataset[feature].values.reshape(-1, 1)
#     Y = escolas_dataset.MEDIA_9EF_MT.values
#     return DecisionTreeRegressor().fit(X, Y).score(X, Y)

In [148]:
kept_features = corrs.columns.values.tolist()
correlation_tolerance = 0.40

for feature_name, correlations in corrs.items():
    if feature_name not in kept_features:
        continue
        
    sorted_correlations = list(correlations.filter(axis=0, items=kept_features).abs().sort_values().items())
    if len(sorted_correlations) < 2:
        continue
        
    most_correlated_feature = sorted_correlations[-2] # skip correlation with itself
    if most_correlated_feature[1] < correlation_tolerance:
        continue
    
    if feature_score(escolas_exp, most_correlated_feature[0]) > feature_score(escolas_exp, feature_name):
        print('{} vs {}: kept {}'.format(feature_name, most_correlated_feature[0], most_correlated_feature[0]))
        kept_features.remove(feature_name)
    else:
        print('{} vs {}: kept {}'.format(feature_name, most_correlated_feature[0], feature_name))
        kept_features.remove(most_correlated_feature[0])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-148-227e6eac96a2> in <module>()
     14         continue
     15 
---> 16     if feature_score(escolas_exp, most_correlated_feature[0]) > feature_score(escolas_exp, feature_name):
     17         print('{} vs {}: kept {}'.format(feature_name, most_correlated_feature[0], most_correlated_feature[0]))
     18         kept_features.remove(feature_name)

TypeError: unorderable types: NoneType() > NoneType()

In [139]:
X = escolas_exp.filter(regex='TX_RESP').values
Y = escolas_exp.MEDIA_9EF_MT
rf = DecisionTreeRegressor().fit(X, Y)

X_test = escolas_val.filter(regex='TX_RESP').values
Y_test = escolas_val.MEDIA_9EF_MT
rf.score(X_test, Y_test)


Out[139]:
-0.55866887020121503

In [130]:
corrs.TX_RESP_Q033.sort_values()


Out[130]:
TX_RESP_Q036   -0.114767
TX_RESP_Q050    0.029233
TX_RESP_Q052    0.048548
TX_RESP_Q025    0.056772
TX_RESP_Q064    0.062637
TX_RESP_Q063    0.062688
TX_RESP_Q062    0.070993
TX_RESP_Q024    0.074789
TX_RESP_Q026    0.077627
TX_RESP_Q066    0.084553
TX_RESP_Q061    0.086716
TX_RESP_Q074    0.088906
TX_RESP_Q007    0.091201
TX_RESP_Q069    0.094568
TX_RESP_Q035    0.094939
TX_RESP_Q028    0.104592
TX_RESP_Q058    0.109321
TX_RESP_Q055    0.111592
TX_RESP_Q009    0.113171
TX_RESP_Q029    0.114275
TX_RESP_Q027    0.117920
TX_RESP_Q015    0.123477
TX_RESP_Q019    0.125819
TX_RESP_Q072    0.126637
TX_RESP_Q068    0.127051
TX_RESP_Q008    0.127859
TX_RESP_Q034    0.128882
TX_RESP_Q073    0.132695
TX_RESP_Q017    0.135316
TX_RESP_Q014    0.139166
                  ...   
TX_RESP_Q012    0.148831
TX_RESP_Q011    0.150429
TX_RESP_Q051    0.153083
TX_RESP_Q059    0.154571
TX_RESP_Q018    0.156994
TX_RESP_Q057    0.157092
TX_RESP_Q022    0.158333
TX_RESP_Q048    0.158611
TX_RESP_Q021    0.159417
TX_RESP_Q010    0.160720
TX_RESP_Q020    0.168343
TX_RESP_Q043    0.169405
TX_RESP_Q049    0.169862
TX_RESP_Q053    0.171332
TX_RESP_Q031    0.172574
TX_RESP_Q047    0.174979
TX_RESP_Q070    0.176031
TX_RESP_Q065    0.177536
TX_RESP_Q054    0.179692
TX_RESP_Q032    0.183469
TX_RESP_Q042    0.191066
TX_RESP_Q039    0.193126
TX_RESP_Q040    0.193265
TX_RESP_Q038    0.200821
TX_RESP_Q056    0.203984
TX_RESP_Q041    0.204577
TX_RESP_Q045    0.206495
TX_RESP_Q060    0.216962
TX_RESP_Q037    0.227165
TX_RESP_Q033    1.000000
Name: TX_RESP_Q033, dtype: float64