In [1]:
from sklearn.svm import SVR
import pandas as pd
import numpy as np

In [2]:
# Read the CSV table, then split it by position:
# column 1 becomes the target `y`, columns 2 onward become the feature matrix `X`.
# NOTE(review): column 0 is skipped — presumably a row identifier; confirm against the file.
df = pd.read_csv("./data/nci60.csv")
y = df.iloc[:, 1]
X = df.iloc[:, 2:]

In [3]:
# Define the rfe-svm function
def rfe_svm(X, y, num_feature_to_selected, estimator=None):
    """Recursive feature elimination driven by the weights of a linear model.

    On every round the model is refit on the remaining columns and the single
    feature whose weight has the smallest magnitude is dropped, until
    ``num_feature_to_selected`` columns are left.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; columns are the candidate features. It is not
        modified, but when no feature has to be removed the very same object
        is returned.
    y : array-like
        Target values passed unchanged to ``estimator.fit``.
    num_feature_to_selected : int
        Number of features to keep. If it is at least ``X.shape[1]`` nothing
        is removed.
    estimator : object, optional
        Any model whose ``fit(X, y)`` returns the fitted model and exposes a
        single-output weight vector as ``coef_`` (shape ``(1, n_features)`` or
        ``(n_features,)``). It is refit in place on every round. Defaults to
        ``SVR(kernel='linear')``.

    Returns
    -------
    X_new : pandas.DataFrame
        ``X`` restricted to the surviving features.
    record : dict
        Maps ``str(column name)`` of each removed feature to the (signed)
        weight it had in the round it was removed, in removal order.
    elapsed_time : float
        Seconds spent in the elimination loop.

    Raises
    ------
    ValueError
        If ``num_feature_to_selected`` is negative.
    """
    import time

    if num_feature_to_selected < 0:
        raise ValueError("num_feature_to_selected must be non-negative")

    # Same default model as before: an SVR with a linear kernel, so that
    # per-feature weights are available through ``coef_``.
    model = SVR(kernel='linear') if estimator is None else estimator
    X_new = X
    record = {}
    n_to_remove = X.shape[1] - num_feature_to_selected
    start_time = time.perf_counter()
    for _ in range(n_to_remove):
        # fit the model and get the weights of the remaining features
        fit = model.fit(X_new, y)
        weights = np.ravel(fit.coef_)
        # Rank by magnitude: a large negative weight is as informative as a
        # large positive one, so the weakest feature is the one closest to 0.
        weakest = int(np.argmin(np.abs(weights)))
        name = X_new.columns[weakest]
        # record the name and weight of the deleted feature
        record[str(name)] = weights[weakest]
        # keyword form; the positional ``axis`` argument is gone in pandas 2.x
        X_new = X_new.drop(columns=name)
    elapsed_time = time.perf_counter() - start_time
    return X_new, record, elapsed_time

In [6]:
# Do the feature selection and get the selected features (keep 50 columns)
X_new, record, elapsed_time = rfe_svm(X, y, 50)
# Only the last expression of a cell is rendered, so the number of removed
# features is printed explicitly instead of being left as a bare expression.
print("features removed:", len(record))
# Seconds spent in the elimination loop (shown as the cell output).
elapsed_time


Out[6]:
364.68830728530884

In [7]:
# Combine the label with the selected features and write to the local destination.
# Insert into a copy: DataFrame.insert works in place and raises ValueError when
# the column already exists, so mutating X_new directly would make this cell fail
# on a second run (and would alter X if no feature had been removed).
selected_with_label = X_new.copy()
selected_with_label.insert(0, "IC50", y)
selected_with_label.to_csv("./selected_features/rfe_svm_out_50.csv")