In [1]:
import time,os
%matplotlib inline
#IMPORT ALL THE THINGS
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
In [3]:
import h2o
# Connect to the H2O server, then import the training table from CSV as an H2O frame.
h2o.connect()
xy_train = h2o.import_file(
    path=os.path.realpath("../input/h2o_xy_CemoRes_IAMG.csv"),
)
In [4]:
# Pull the H2O frame into a local pandas DataFrame and report its shape.
xy_train_df = xy_train.as_data_frame(use_pandas=True)
print(xy_train_df.shape)
In [14]:
# Peek at the top-left 2x2 corner of the pandas copy.
xy_train_df.iloc[:2, :2]
Out[14]:
In [15]:
# Same 2x2 corner, as a raw NumPy array.
xy_train_df.iloc[:2, :2].values
Out[15]:
In [16]:
# Same 2x2 corner, sliced directly from the H2O frame for comparison with the pandas copy.
xy_train[0:2,0:2]
Out[16]:
In [5]:
# Split the column names: the first 13330 go to X, the column at position 13330 is the target y.
# NOTE(review): 13330 is hard-coded to this dataset's width — confirm before reusing on other data.
# NOTE(review): every later cell uses X[1:], so X[0] looks like a non-feature column — confirm.
X = xy_train.col_names[:13330]
y = xy_train.col_names[13330]
y
Out[5]:
In [6]:
# Variance cut-off p * (1 - p) with p = 0.8, i.e. the variance of a 0/1 feature whose
# majority value covers 80% of the samples. Features whose variance does not exceed
# this value are dropped by the selector.
# NOTE(review): the 80% reading only holds for boolean features — confirm the feature encoding.
threshold = .8 * (1 - .8)
# Pass the named value so the selector and the later mask computation cannot drift apart.
sel = VarianceThreshold(threshold=threshold)
In [17]:
# Fit the variance filter on every X column except X[0] and keep only the surviving columns.
xy_train_np_vari_x = sel.fit_transform(
    xy_train_df[X[1:]].values
)
In [19]:
# Feature matrix shape before and after the variance filter.
print(xy_train_df[X[1:]].shape)
print(xy_train_np_vari_x.shape)
In [24]:
# Boolean mask over the filtered columns (X[1:]): True where the fitted variance exceeds the cut-off.
bool_mask = sel.variances_ > threshold
# Positions of the True entries; reuse the mask instead of recomputing the comparison.
idx = np.where(bool_mask)[0]
In [45]:
def diff(first, second):
    """Return the items of ``first`` that do not occur in ``second``.

    The order and any duplicates of ``first`` are preserved. ``second`` is turned
    into a set for the membership tests, so its items must be hashable.
    """
    second = set(second)
    return [item for item in first if item not in second]
In [47]:
# idx indexes into X[1:], so shift by one to address positions in the full X list.
kept_positions = (idx + 1).astype(int)
print(idx.shape)
print(kept_positions[0:])
# Names of the columns that passed the variance filter.
selected_colnames = [X[pos] for pos in kept_positions]
print(selected_colnames[0:5])
# Names of the filtered columns that did not pass.
removed_colnames = diff(X[1:], selected_colnames)
print(removed_colnames[0:5])
In [ ]:
In [48]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
In [51]:
# Fit an ExtraTreesClassifier on the X[1:] feature columns against target y and read
# the per-feature importances. The seed is pinned so the importances written out
# later are reproducible from one run to the next.
clf = ExtraTreesClassifier(random_state=0)
clf = clf.fit(xy_train_df[X[1:]].values, xy_train_df[y].values)
varimp_np = clf.feature_importances_
In [64]:
# Score on the very same rows the model was fitted on — a training score, not a held-out estimate.
clf.score(xy_train_df[X[1:]].values, xy_train_df[y].values)
Out[64]:
In [67]:
# Exploratory: inspect the decision paths of the fitted model for the training rows.
clf.decision_path(xy_train_df[X[1:]].values)
Out[67]:
In [54]:
# One importance value per feature, indexed by the feature's column name.
varimp_df = pd.DataFrame(data=varimp_np, index=X[1:])
In [56]:
# Distribution of the importance values.
varimp_df.hist()
Out[56]:
In [57]:
# Importance value per feature, in column order.
varimp_df.plot()
Out[57]:
In [58]:
# Persist the importances.
# NOTE(review): the file is named "RF_imp" but the values come from ExtraTreesClassifier — confirm the naming.
varimp_df.to_csv("RF_imp.csv")
In [68]:
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
In [69]:
# L1-penalised linear SVM fitted on the X[1:] feature columns against target y.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False)
lsvc = lsvc.fit(xy_train_df[X[1:]].values, xy_train_df[y].values)
In [70]:
# Score on the same rows used for fitting — a training score, not a held-out estimate.
lsvc.score(xy_train_df[X[1:]].values, xy_train_df[y].values)
Out[70]:
In [74]:
# Shape of the transposed coefficient matrix.
lsvc.coef_.T.shape
Out[74]:
In [75]:
# SVM coefficients with one row per feature, indexed by the feature's column name.
varCoef_df = pd.DataFrame(data=lsvc.coef_.T, index=X[1:])
In [ ]:
In [76]:
# Distribution of the SVM coefficients.
varCoef_df.hist()
Out[76]:
In [77]:
# SVM coefficient per feature, in column order.
varCoef_df.plot()
Out[77]:
In [79]:
# Persist the L1 SVM coefficients.
varCoef_df.to_csv("SVM_l1_imp.csv")
In [81]:
from sklearn.linear_model import LogisticRegression
In [82]:
# L1-penalised logistic regression fitted on the X[1:] feature columns against target y.
# The solver is pinned to liblinear: it supports the L1 penalty, whereas the lbfgs
# default of current scikit-learn rejects penalty="l1" with a ValueError. liblinear
# was the default in older releases, so results there are unchanged.
logr = LogisticRegression(
    C=0.01, penalty="l1", dual=False, solver="liblinear"
).fit(xy_train_df[X[1:]].values, xy_train_df[y].values)
In [83]:
# Score on the same rows used for fitting — a training score, not a held-out estimate.
logr.score(xy_train_df[X[1:]].values, xy_train_df[y].values)
Out[83]:
In [84]:
# Logistic-regression coefficients with one row per feature, indexed by column name.
varCoef_logr_df = pd.DataFrame(data=logr.coef_.T, index=X[1:])
In [85]:
# Logistic-regression coefficient per feature, in column order.
varCoef_logr_df.plot()
Out[85]:
In [86]:
# Distribution of the logistic-regression coefficients.
varCoef_logr_df.hist()
Out[86]:
In [87]:
# Persist the L1 logistic-regression coefficients.
varCoef_logr_df.to_csv("logr_l1_imp.csv")
In [88]:
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
In [89]:
# Recursive feature elimination around a linear-kernel SVC: eliminate down to a single
# feature, one feature per step, then read the resulting per-feature ranking.
# NOTE(review): with step=1 this refits once per eliminated feature — expect a long run on wide data.
svc = SVC(C=0.1, kernel="linear")
rfe = RFE(estimator=svc, step=1, n_features_to_select=1)
rfe.fit(xy_train_df[X[1:]].values, xy_train_df[y].values)
ranking = rfe.ranking_
In [91]:
# Shape of the RFE ranking array.
ranking.shape
Out[91]:
In [92]:
# RFE rank per feature, indexed by the feature's column name.
varRank_rfe_svm_l2_df = pd.DataFrame(data=ranking, index=X[1:])
In [93]:
# Persist the RFE ranking.
varRank_rfe_svm_l2_df.to_csv("varRank_rfe_svm_l2_imp.csv")
In [94]:
from sklearn.feature_selection import SelectKBest
In [95]:
# k="all" keeps every feature; the selector is used here only for its per-feature scores and p-values.
# No score function is passed, so SelectKBest's default is used.
sbf = SelectKBest(k="all")
In [96]:
# Score the X[1:] feature columns against target y.
sbf.fit(xy_train_df[X[1:]].values, xy_train_df[y].values)
Out[96]:
In [99]:
# Per-feature p-value and score from the fitted selector, indexed by column name.
sbf_score_pval_df = pd.DataFrame(
    {"pval": sbf.pvalues_, "score": sbf.scores_},
    index=X[1:],
    columns=["pval", "score"],
)
In [100]:
# Preview the first rows of the score table.
sbf_score_pval_df.head()
Out[100]:
In [101]:
# Persist the SelectKBest scores and p-values.
sbf_score_pval_df.to_csv("score_sbf_imp.csv")
In [102]:
from sklearn.feature_selection import chi2
In [103]:
# Chi-squared test of each X[1:] feature column against target y; the result is indexed
# below as [0] = scores and [1] = p-values.
# NOTE(review): chi2 expects non-negative feature values — confirm this holds for the data.
chi_model = chi2(xy_train_df[X[1:]].values, xy_train_df[y].values)
In [107]:
# Per-feature chi-squared p-value and score, indexed by column name, followed by a preview.
chi2_score_pval_df = pd.DataFrame(
    {"pval": chi_model[1], "score": chi_model[0]},
    index=X[1:],
    columns=["pval", "score"],
)
chi2_score_pval_df.head()
Out[107]:
In [108]:
# Persist the chi-squared scores and p-values.
chi2_score_pval_df.to_csv("score_chi2_imp.csv")
In [ ]: