In [1]:
from sklearn.svm import SVR
import pandas as pd
import numpy as np
In [2]:
# Read the raw table from disk, then split it by column position:
# column 1 is used as the response and columns 2 onward as the predictors
# (column 0 is not used by either).
df = pd.read_csv("./data/nci60.csv")
X, y = df.iloc[:, 2:], df.iloc[:, 1]
In [3]:
# Define the rfe-svm function
def rfe_svm(X, y, num_feature_to_selected, estimator=None):
    """Recursive feature elimination driven by the weights of a linear model.

    On every round the model is refit on the remaining columns and the single
    column whose coefficient has the smallest *absolute* value is removed,
    until only ``num_feature_to_selected`` columns are left.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, one column per feature. It is not modified.
    y : array-like
        Target values passed unchanged to ``estimator.fit``.
    num_feature_to_selected : int
        Number of columns to keep. If it is greater than or equal to the
        number of columns in ``X``, nothing is removed.
    estimator : object, optional
        Any object with ``fit(X, y)`` that returns the fitted model exposing a
        ``coef_`` attribute with one weight per column. Defaults to
        ``SVR(kernel='linear')``.

    Returns
    -------
    X_new : pandas.DataFrame
        ``X`` restricted to the surviving columns (``X`` itself when nothing
        was removed).
    record : dict
        Maps ``str(column name)`` to the signed weight the column had in the
        round it was eliminated; insertion order is elimination order.
    elapsed_time : float
        Seconds spent in the elimination loop.

    Raises
    ------
    ValueError
        If ``num_feature_to_selected`` is negative.
    """
    import time

    if num_feature_to_selected < 0:
        raise ValueError("num_feature_to_selected must be non-negative")

    model = SVR(kernel='linear') if estimator is None else estimator
    # Never negative: asking for more features than exist removes nothing.
    n_to_remove = max(X.shape[1] - num_feature_to_selected, 0)

    X_new = X
    record = {}
    start_time = time.perf_counter()
    for _ in range(n_to_remove):
        # Refit on the surviving columns and flatten coef_ (shape (1, n)) to 1-D.
        fitted = model.fit(X_new, y)
        weights = np.ravel(fitted.coef_)
        # Rank by magnitude: a large negative weight is as informative as a
        # large positive one, so the signed minimum must not be used here.
        weakest = int(np.argmin(np.abs(weights)))
        record[str(X_new.columns[weakest])] = weights[weakest]
        # Remove by position so duplicated column names cannot drop extra
        # columns, and so no positional ``axis`` argument to ``drop`` is needed.
        keep = np.delete(np.arange(X_new.shape[1]), weakest)
        X_new = X_new.iloc[:, keep]
    # Computed after the loop so it is defined even when nothing was removed.
    elapsed_time = time.perf_counter() - start_time
    return X_new, record, elapsed_time
In [6]:
# Run the feature selection and keep the 50 selected features
X_new, record, elapsed_time = rfe_svm(X, y, 50)
# To inspect each eliminated feature and its weight, iterate over record.items().
# Only the last expression of a cell is displayed, so the number of eliminated
# features and the elapsed time (seconds) are shown together as one tuple.
len(record), elapsed_time
Out[6]:
In [7]:
# Combine the label with the selected features and write the result to disk.
# The insert mutates X_new in place, so it is guarded: re-running this cell
# would otherwise fail because the "IC50" column already exists.
if "IC50" not in X_new.columns:
    X_new.insert(0, "IC50", y)
X_new.to_csv("./selected_features/rfe_svm_out_50.csv")