notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import scale

import utils



In [2]:

    
def choose_features(x, y, threshold=0.15, random_state=None):
    """
    Select the features of high importance to reduce the feature space.

    A random forest regressor is fitted on ``x`` and ``y``; only the columns
    whose importance is greater than or equal to ``threshold`` are kept.

    :param x: Dataframe (or 2-D array-like) of features
    :param y: Dataframe (or array-like) of the target property
    :param threshold: importance cut-off passed to SelectFromModel
        (default 0.15, the value that used to be hard-coded here)
    :param random_state: seed forwarded to the random forest; the default
        ``None`` leaves the forest unseeded, so pass an int to make the
        selected columns reproducible between runs
    :return desired_x: NumPy array holding only the short-listed feature
        columns (column labels of a Dataframe input are not preserved)
    """
    features = np.asarray(x)
    target = np.asarray(y)

    # A one-column target Dataframe becomes an (n, 1) column vector, which
    # makes the forest emit a DataConversionWarning. Flattening it gives the
    # same fit without the warning; multi-column targets are left untouched.
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    # Random forest feature importance drives the selection.
    forest = RandomForestRegressor(random_state=random_state)
    selector = SelectFromModel(forest, threshold=threshold)

    return selector.fit_transform(features, target)



In [3]:

    
# Locations of the preprocessed inputs, relative to the notebook's working directory.
FEATURES_CSV = 'data/df_x_preprocessing.csv'
TARGET_CSV = 'data/df_y_preprocessing.csv'

# Read the feature table first, then the target-property table.
df_x = pd.read_csv(FEATURES_CSV)
df_y = pd.read_csv(TARGET_CSV)



In [4]:

    
# Pass both tables through the project's `utils.remove_nan_infinite` helper,
# features first and targets second.
# NOTE(review): the tables are handled independently; confirm the helper
# cannot drop different rows from each, or features and targets would no
# longer line up row-for-row.
df_x, df_y = utils.remove_nan_infinite(df_x), utils.remove_nan_infinite(df_y)
print("Checked dataframe for NaN and infinite values")









    



Checked dataframe for NaN and infinite values



In [ ]:

    
# Transform all column values to mean 0 and unit variance.
# `scale` is imported explicitly in the imports cell: reaching it through
# `sklearn.preprocessing` after a bare `import sklearn` only works when some
# other import happens to have loaded that submodule already.
# Note that `scale` returns NumPy arrays, so df_x / df_y lose their column
# labels from this cell onwards.
print("Transforming dataframe using mean and variance")
df_x = scale(df_x)
df_y = scale(df_y)
print("Transformed dataframe using mean and variance")









    



Transforming dataframe using mean and variance
Transformed dataframe using mean and variance



In [ ]:

    
# Reduce the feature matrix to the columns short-listed by feature selection.
# NOTE(review): this calls `choose_features` from the `utils` module, while
# this notebook also defines a `choose_features` function in an earlier cell
# that is never called here; confirm which implementation is intended.
df_x = utils.choose_features(df_x, df_y)



In [ ]: