In [1]:
import numpy as np
from  sklearn.ensemble import RandomForestRegressor
import sklearn
import pandas as pd
import utils

In [2]:
def choose_features(x, y, threshold=0.15, random_state=None):
    """
    Select the features of high importance to reduce the feature space.

    A random forest regressor is fitted on (x, y) by SelectFromModel, and only
    the feature columns whose importance reaches ``threshold`` are kept.

    :param x: Dataframe (or 2-D array-like) of features, one column per feature
    :param y: Dataframe (or array-like) of the target property
    :param threshold: importance cut-off passed to SelectFromModel
        (default 0.15, the value previously hard-coded here)
    :param random_state: seed passed to RandomForestRegressor; the default
        None leaves the forest unseeded, so the selected columns may differ
        between runs. Pass an int for reproducible selection.
    :return desired_x: NumPy array holding only the short-listed feature
        columns (this is an array, not a Dataframe; column labels are lost)
    """
    # Import the submodule explicitly: `import sklearn` on its own does not
    # promise that `sklearn.feature_selection` is reachable as an attribute.
    from sklearn.feature_selection import SelectFromModel

    x = np.asarray(x)
    y = np.asarray(y)

    # A one-column target frame converts to shape (n_samples, 1); flatten it
    # to the 1-D shape the regressor expects for a single target.
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    # Random forest feature importance drives the selection.
    forest = RandomForestRegressor(random_state=random_state)
    sfm = SelectFromModel(forest, threshold=threshold)
    sfm.fit(x, y)

    return sfm.transform(x)

In [3]:
# Load the pre-processed feature table (df_x) and target table (df_y).
df_x, df_y = (
    pd.read_csv('data/df_x_preprocessing.csv'),
    pd.read_csv('data/df_y_preprocessing.csv'),
)

In [4]:
# Pass both tables through the project's NaN / infinity clean-up helper.
# NOTE(review): the two frames are cleaned independently; if the helper drops
# rows, df_x and df_y could end up misaligned — confirm against utils.
df_x, df_y = (utils.remove_nan_infinite(frame) for frame in (df_x, df_y))
print("Checked dataframe for NaN and infinite values")


Checked dataframe for NaN and infinite values

In [ ]:
# Standardise the features and the target with sklearn.preprocessing.scale
# (zero mean, unit variance).
# NOTE(review): scale() is expected to return plain arrays, so df_x / df_y
# would no longer carry column labels after this cell — confirm.
print("Transforming dataframe using mean and variance")
df_x, df_y = (
    sklearn.preprocessing.scale(df_x),
    sklearn.preprocessing.scale(df_y),
)
print("Transformed dataframe using mean and variance")


Transforming dataframe using mean and variance
Transformed dataframe using mean and variance

In [ ]:
# Replace df_x with the result of utils.choose_features (presumably the
# reduced set of high-importance features).
# NOTE(review): this notebook defines its own choose_features in an earlier
# cell, but this call goes to the utils module's function of the same name,
# so the local definition is never used — confirm which one is intended.
df_x = utils.choose_features(df_x, df_y)

In [ ]: