In [1]:
import numpy as np
import pandas as pd
from IPython.display import Image
In [2]:
input_file = '../data/train_feature_sample.csv'
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}
In [3]:
df_train = pd.read_csv(input_file, dtype=dtypes)
X = df_train.drop(columns=['click_time', 'id'])
y = X.pop('is_attributed')
X.head()
Out[3]:
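Passing explicit unsigned-integer dtypes keeps these ID-like columns much smaller than pandas' default int64. A quick way to confirm the footprint (illustrative, not part of the original run):

X.info(memory_usage='deep')  # per-column dtypes plus the true in-memory size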
In [4]:
Image(url="../img/filter-methods.png", width=600, height=600)
Out[4]:
In [5]:
len(X.columns)
Out[5]:
In [6]:
X.columns
Out[6]:
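The next cell drops NUM_UNIQUE(clicks.YEAR(click_time)), which is presumably constant because all clicks in the sample fall in the same year. One way to spot such zero-information columns (a sketch, not part of the original run):

X.nunique().sort_values().head()  # columns with a single distinct value are candidates to drop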
In [7]:
X_tmp = X.drop(columns='NUM_UNIQUE(clicks.YEAR(click_time))')
In [8]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_tmp = sel.fit_transform(X)
len(X_tmp[0])
Out[8]:
In [9]:
for x in zip(X.columns, sel.variances_):
    print(f"{x[0]}: {x[1]}")
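To see which columns survive the threshold rather than just the transformed array, the selector's boolean mask can index the original columns (a sketch using sklearn's get_support(), not part of the original run):

X.columns[sel.get_support()]  # names of the features kept by VarianceThreshold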
In [10]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
In [11]:
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, y)
In [12]:
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)
In [13]:
for x in zip(X.columns, fit.support_):
    if x[1]:
        print(f"{x[0]}: {x[1]}")
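RFE also exposes the full elimination order; rank 1 marks the selected features and higher ranks were dropped earlier (illustrative, not part of the original run):

sorted(zip(fit.ranking_, X.columns))[:10]  # features ordered from most to least preferred by RFE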
In [14]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty='l1', solver='liblinear')
sfm = SelectFromModel(model, threshold=0.05)
fit = sfm.fit(X, y)
X_tmp = sfm.transform(X)
n_features = X_tmp.shape[1]
In [15]:
n_features
Out[15]:
In [16]:
X.columns[fit.get_support()]
Out[16]:
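The L1-penalised coefficients behind this selection can be inspected through the fitted estimator (a sketch; assumes the columns keep their original order):

coefs = pd.Series(fit.estimator_.coef_[0], index=X.columns)
coefs.abs().sort_values(ascending=False).head(10)  # largest-magnitude L1 coefficients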
In [17]:
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model = model.fit(X, y)
In [18]:
for x in zip(X.columns, model.feature_importances_):
    print(f"{x[0]}: {x[1]}")
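The same importances read more easily as a sorted Series (illustrative, not part of the original run):

pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False).head(10)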
In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(n_estimators=400, n_jobs=-1)
scores = cross_val_score(estimator=clf, X=X, y=y, cv=3, scoring="roc_auc", verbose=True)
"AUC %.2f +/- %.2f" % (scores.mean(), scores.std())
Out[19]:
In [20]:
clf = clf.fit(X, y)
In [21]:
def feature_importances(model, features, n=10):
    # print the n largest importances and return the corresponding feature names
    importances = model.feature_importances_
    zipped = sorted(zip(features, importances), key=lambda x: -x[1])
    for i, f in enumerate(zipped[:n]):
        print("%d: Feature: %s, %.3f" % (i + 1, f[0], f[1]))
    return [f[0] for f in zipped[:n]]

top_features = feature_importances(clf, X, n=20)
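As a sanity check (a sketch, not run here), the cross-validated AUC can be recomputed on just the top features to see how much of the signal they retain:

scores_top = cross_val_score(estimator=RandomForestClassifier(n_estimators=400, n_jobs=-1),
                             X=X[top_features], y=y, cv=3, scoring="roc_auc")
"AUC %.2f +/- %.2f" % (scores_top.mean(), scores_top.std())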
In [22]:
# Install it first; restart kernel if needed
'''
git clone --recursive https://github.com/dmlc/xgboost
cd xgboost
make -j4
cd python-package
python setup.py develop --user
'''
Out[22]:
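If building from source is not required, the prebuilt wheel from PyPI is usually enough:

pip install xgboost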
In [23]:
import xgboost as xgb
clf_xgBoost = xgb.XGBClassifier(
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    scale_pos_weight=9,
    min_child_weight=0,
    reg_alpha=4,
    n_jobs=4,
    objective='binary:logistic'
)
# sanitize feature names: XGBoost rejects characters such as '[' and ']'
X_xgb = X.rename(columns=lambda x: x.replace(", ", "_").replace("[", "_").replace("]", "_"))
In [24]:
clf_xgBoost.fit(X_xgb, y)
Out[24]:
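xgboost also ships a built-in plot of the booster's importances (a sketch; requires matplotlib and was not part of the original run):

xgb.plot_importance(clf_xgBoost, importance_type='gain', max_num_features=20)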
In [25]:
from sklearn import preprocessing
importance_dict = {}
for import_type in ['weight', 'gain', 'cover']:
    importance_dict['xgBoost-' + import_type] = clf_xgBoost.get_booster().get_score(importance_type=import_type)
'''
‘weight’ - the number of times a feature is used to split the data across all trees.
‘gain’ - the average gain of the feature when it is used in trees
‘cover’ - the average coverage of the feature when it is used in trees
'''
importance_df = pd.DataFrame(importance_dict).fillna(0)
importance_df = pd.DataFrame(
    preprocessing.MinMaxScaler().fit_transform(importance_df),
    columns=importance_df.columns,
    index=importance_df.index
)
importance_df['mean'] = importance_df.mean(axis=1)
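The combined ranking can be inspected directly before plotting (illustrative, not part of the original run):

importance_df['mean'].sort_values(ascending=False).head(10)  # features ranked by the averaged, scaled importances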
In [27]:
importance_df.sort_values('mean').plot(kind='bar', figsize=(20, 7))
Out[27]:
In [ ]: