In [1]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import sklearn
import pandas as pd
import utils
In [2]:
def choose_features(x, y, threshold=0.15, random_state=None):
    """
    Select the features of high importance to reduce the feature space.

    A random-forest regressor is fitted on (x, y) and only the columns whose
    feature importance reaches ``threshold`` are kept.

    :param x: Dataframe (or 2-D array-like) of features
    :param y: Dataframe (or array-like) of the target property
    :param threshold: importance cut-off handed to ``SelectFromModel``;
        defaults to 0.15, the value that used to be hard-coded here
    :param random_state: seed forwarded to the random forest; the default
        ``None`` keeps the previous unseeded behaviour, pass an int to get
        a reproducible selection
    :return desired_x: NumPy array holding only the short-listed feature
        columns (column labels of an input dataframe are not carried over)
    """
    # Imported explicitly: a bare ``import sklearn`` does not guarantee that
    # the ``feature_selection`` submodule is reachable as an attribute.
    from sklearn.feature_selection import SelectFromModel

    x = np.asarray(x)
    y = np.asarray(y)
    # A one-column target table turns into an (n, 1) array; flatten it to the
    # 1-D form used for a single regression target. Multi-column targets are
    # passed through untouched.
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    # Random forest feature importance
    forest = RandomForestRegressor(random_state=random_state)
    sfm = SelectFromModel(forest, threshold=threshold)
    sfm.fit(x, y)
    return sfm.transform(x)
In [3]:
# Read the preprocessed feature table and target table from the data/ folder.
df_x, df_y = (
    pd.read_csv(csv_path)
    for csv_path in ('data/df_x_preprocessing.csv', 'data/df_y_preprocessing.csv')
)
In [4]:
# Run both tables through the project's NaN / infinity clean-up helper.
# NOTE(review): the two tables are cleaned independently of each other; if the
# helper drops rows, confirm that features and targets stay row-aligned.
df_x, df_y = (utils.remove_nan_infinite(table) for table in (df_x, df_y))
print("Checked dataframe for NaN and infinite values")
In [ ]:
# Standardise features and target column-wise: zero mean, unit variance.
# NOTE(review): per the scikit-learn docs, scale() hands back NumPy arrays, so
# the column labels of the dataframes are presumably gone after this cell.
print("Transforming dataframe using mean and variance")
df_x, df_y = (sklearn.preprocessing.scale(table) for table in (df_x, df_y))
print("Transformed dataframe using mean and variance")
In [ ]:
# Reduce the feature matrix to the columns the selector rates as important.
# NOTE(review): this calls utils.choose_features, not the choose_features
# defined earlier in this notebook — confirm both are the same implementation.
# NOTE(review): df_x is overwritten, so re-running this cell filters the
# already-reduced matrix again; restart and run all to get a clean result.
df_x = utils.choose_features(df_x, df_y)
In [ ]: