In [2]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
import pandas as pd
import numpy as np
import time

In [3]:
# Read the NCI-60 table and split it by column position: columns 2 onward form
# the feature matrix X, column 1 is the regression target y.
# NOTE(review): column 0 is used by neither — presumably an identifier; confirm.
df = pd.read_csv("./data/nci60.csv")
X, y = df.iloc[:, 2:], df.iloc[:, 1]

In [4]:
# Recursive feature elimination driven by a linear-kernel SVR: keep the 50
# best features, dropping one feature per iteration (step=1).
# `clf` and `estimator` are the same object, as in the original
# set_params() form (set_params returns the estimator itself).
clf = SVR(kernel='linear')
estimator = clf

# n_features_to_select must be passed by keyword: passing it positionally was
# deprecated in scikit-learn 0.24 and raises TypeError from 1.0 onward.
selector = RFE(estimator, n_features_to_select=50, step=1)

# Wall-clock timing of the fit; the last expression displays the seconds taken.
start_time = time.time()
selector = selector.fit(X, y)
elapsed_time = time.time() - start_time
elapsed_time


Out[4]:
281.8505299091339

In [24]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
try:
    # RandomizedLasso was removed in scikit-learn 0.21. Importing it
    # unconditionally makes this whole cell fail on current releases, so it is
    # only bound when the installed version still ships it.
    from sklearn.linear_model import RandomizedLasso
except ImportError:
    RandomizedLasso = None

# Recursive feature elimination driven by ridge regression (alpha=7): keep the
# 50 best features, dropping one feature per iteration (step=1).
ridge = Ridge(alpha=7)

# n_features_to_select must be passed by keyword: passing it positionally was
# deprecated in scikit-learn 0.24 and raises TypeError from 1.0 onward.
selector = RFE(ridge, n_features_to_select=50, step=1)

# Wall-clock timing of the fit; the last expression displays the seconds taken.
start_time = time.time()
selector = selector.fit(X, y)
elapsed_time = time.time() - start_time
elapsed_time


Out[24]:
152.045969247818

In [13]:
# Print the per-feature RFE ranking, then the estimator held by the fitted
# selector, each on its own line.
print(selector.ranking_, selector.estimator_, sep="\n")


[10615 10614 10613 ...,  9074  9753  9768]
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [25]:
# Keep only the columns RFE retained. `support_` is a boolean mask over the
# columns of X; `ranking_` holds ranks rather than column positions, so it
# cannot be used as an indexer here.
X_new = X.loc[:, selector.support_]

In [26]:
# Put the target in front of the selected features and write the table to CSV.
# DataFrame.insert raises ValueError when the column already exists, so the
# insert is guarded to keep this cell safe to re-run on the same kernel.
if "IC50" not in X_new.columns:
    X_new.insert(0, "IC50", y)

# NOTE(review): the file name spells "sklean" (not "sklearn"); it is kept
# unchanged because other code may read this exact path.
X_new.to_csv("./selected_features/rfe_sklean_ridge_50.csv")