In [19]:
from sklearn.svm import SVR
from minepy import MINE
import pandas as pd
import numpy as np
In [20]:
# Read the NCI-60 table from disk.
filename = "./data/nci60.csv"
df = pd.read_csv(filename)

# Positional split: the column at position 1 becomes the target vector,
# and every column from position 2 onward becomes the feature matrix.
y = df.iloc[:, 1]
X = df.iloc[:, 2:]
In [21]:
# Define the recursive feature elimination function that ranks features by linear-SVR weight combined with MIC
def rfe_svm_mic(X, y, num_feature_to_selected, mic_weight=0.5):
    """Recursively eliminate features using linear-SVR weights blended with MIC.

    On every round a linear-kernel SVR is fitted on the remaining columns,
    each column is scored as

        (1 - mic_weight) * |svr_coef| + mic_weight * MIC(column, y)

    and the single lowest-scoring column is dropped. Rounds repeat until
    ``num_feature_to_selected`` columns are left.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one column per feature. Column names are used as
        keys, so they are assumed to be unique -- TODO confirm for the
        input data.
    y : array-like
        Target values passed to ``SVR.fit`` and ``MINE.compute_score``.
    num_feature_to_selected : int
        Number of columns to keep. If it is greater than or equal to the
        number of columns in ``X``, nothing is removed.
    mic_weight : float, default 0.5
        Share of the MIC term in the blended score; the SVR term gets
        ``1 - mic_weight``. Must lie in [0, 1].

    Returns
    -------
    X_new : pandas.DataFrame
        ``X`` restricted to the surviving columns (``X`` itself when
        nothing was removed).
    record : dict
        Maps each removed column name to its blended score at the moment
        of removal, in removal order.
    elapsed_time : float
        Wall-clock seconds spent inside this function.

    Raises
    ------
    ValueError
        If ``num_feature_to_selected`` is negative or ``mic_weight`` is
        outside [0, 1].

    Notes
    -----
    The SVR weights are not rescaled before blending, so the balance
    between the two terms depends on the scale of ``X`` and ``y``.
    """
    import time

    if num_feature_to_selected < 0:
        raise ValueError("num_feature_to_selected must be non-negative")
    if not 0.0 <= mic_weight <= 1.0:
        raise ValueError("mic_weight must be between 0 and 1")

    start_time = time.time()
    model = SVR(kernel='linear')
    n_to_remove = X.shape[1] - num_feature_to_selected

    # MIC(column, y) depends only on that column and y, never on which other
    # columns are still present, so compute it once up front instead of
    # once per elimination round.
    mic_scores = {}
    if n_to_remove > 0:
        mine = MINE()
        for position, name in enumerate(X.columns):
            mine.compute_score(X.iloc[:, position], y)
            mic_scores[name] = mine.mic()

    svm_weight = 1.0 - mic_weight
    X_new = X
    record = {}
    for _ in range(n_to_remove):
        names = list(X_new)
        # fit the svm and get the weights of the remaining features
        fit = model.fit(X_new, y)
        weights = fit.coef_[0]
        # Rank by weight magnitude: a large negative weight is just as
        # influential as a large positive one, so the sign must not make a
        # feature look like the weakest candidate.
        sum_cor = {
            name: svm_weight * abs(weights[position]) + mic_weight * mic_scores[name]
            for position, name in enumerate(names)
        }
        # drop the column with the smallest blended score
        min_colname = min(sum_cor, key=sum_cor.get)
        X_new = X_new.drop(min_colname, axis=1)
        # record the name and score of the deleted feature
        record[min_colname] = sum_cor[min_colname]

    elapsed_time = time.time() - start_time
    return X_new, record, elapsed_time
In [22]:
# Run the feature selection and keep the 50 surviving features
X_new, record, elapsed_time = rfe_svm_mic(X, y, 50)
# Only the last expression of a cell is displayed, so report both the number
# of eliminated features and the elapsed time (seconds) as one tuple.
len(record), elapsed_time
Out[22]:
In [23]:
# Prepend the label as column "IC50" and write the result to the local destination.
# DataFrame.insert modifies X_new in place and raises ValueError when the column
# already exists, so guard it to keep this cell safe to re-run.
if "IC50" not in X_new.columns:
    X_new.insert(0, "IC50", y)
X_new.to_csv("./selected_features/rfe_svm_mic_out_50_with_50percent.csv")