In [1]:
import pandas as pd
from scipy import stats
import numpy as np # linear algebra
import sklearn
import gc
from sklearn.preprocessing import StandardScaler
import xgboost as xgb



In [2]:
# Try each scikit-learn feature selection approach in turn
# Removing features with low variance
from sklearn.feature_selection import VarianceThreshold
# Univariate feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
# L1-based feature selection
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
# Tree-based feature selection
from sklearn.ensemble import ExtraTreesClassifier


# Feature selection as part of a pipeline (listed in the plan, not exercised below)
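LinearSVC is imported above for the L1-based route but never actually run in this notebook. As a minimal sketch of that step (an assumption, not the author's code): since logerror is continuous, an L1-penalised regressor such as Lasso stands in for the classifier, using the x_train/y_train built in the cells below.

from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.01).fit(x_train, y_train)  # alpha is an untuned placeholder
l1_model = SelectFromModel(lasso, prefit=True)   # keeps features with non-zero coefficients
se_x = l1_model.transform(x_train)
se_x.shape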

In [3]:
prop = pd.read_csv('D:\\data\\properties_2016.csv',
                   dtype={'hashottuborspa': 'bool', 'propertycountylandusecode': 'object',
                          'propertyzoningdesc': 'object', 'fireplaceflag': 'bool',
                          'taxdelinquencyflag': 'object'})
train = pd.read_csv('D:\\data\\train_2016.csv')
# Downcast float64 columns to float32 to halve their memory footprint.
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)
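A quick sanity check, not part of the original run, that the float32 downcast roughly halves the memory held by the numeric columns:

print('%.1f MB' % (prop.memory_usage(deep=True).sum() / 1024 ** 2))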

In [5]:
# Attach the property features to each training transaction.
df_train = train.merge(prop, how='left', on='parcelid')

In [6]:
# Drop the target, identifiers, and the two free-text zoning columns.
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate',
                         'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values

In [7]:
# Cast the remaining object columns to booleans; comparing with True sends
# every non-boolean value (NaN included) to False.
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
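Note that taxdelinquencyflag stores 'Y' strings, so the == True comparison above maps that column entirely to False. A hedged variant, not in the original, that also counts the 'Y' strings as positive flags:

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = x_train[c].isin([True, 'Y'])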

In [8]:
# Impute missing values with the column means.
mean = x_train.mean(axis=0)
x_train.fillna(mean, inplace=True)
print('fill nan')


fill nan

In [9]:
x_train.shape
# baseline: xgb on all 55 features, best [231] train-mae:0.065133 valid-mae:0.073991


Out[9]:
(90275, 55)

In [15]:
# Removing features with low variance: drop columns whose variance is below 0.02.
sel = VarianceThreshold(threshold=0.02)
se_x = sel.fit_transform(x_train)
se_x.shape
# xgb mae best [217] train-mae:0.065649 valid-mae:0.074148


Out[15]:
(90000, 15)
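fit_transform returns a bare ndarray, so the surviving column names are lost. A small follow-up sketch, assuming sel from the cell above is still in memory, to recover them:

kept = x_train.columns[sel.get_support()]
print(list(kept))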

In [12]:
# Univariate feature selection: keep the 40 features with the highest F-test scores.
se_x = SelectKBest(f_regression, k=40).fit_transform(x_train, y_train)
se_x.shape
# the subsequent xgb run crashed the notebook kernel, so no score was recorded


Out[12]:
(90000, 40)
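A variant of the same univariate selection, added here as an illustration, that keeps the column names around for inspection:

skb = SelectKBest(f_regression, k=40).fit(x_train, y_train)
print(list(x_train.columns[skb.get_support()]))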

In [12]:
# Tree-based feature selection: fit a random forest and keep features whose
# importance exceeds the mean (SelectFromModel's default threshold).
from sklearn import ensemble
clf = ensemble.RandomForestRegressor(n_estimators=100, n_jobs=17)
clf = clf.fit(x_train, y_train)
model = SelectFromModel(clf, prefit=True)
se_x = model.transform(x_train)
se_x.shape
# xgb mae best [222] train-mae:0.065165 valid-mae:0.074297


Out[12]:
(90275, 15)
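To see which columns SelectFromModel kept, the forest's importances can be ranked directly; this sketch assumes clf from the cell above is still live:

imp = pd.Series(clf.feature_importances_, index=x_train.columns)
print(imp.sort_values(ascending=False).head(15))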

In [16]:
# Hold out the last 275 rows for validation; train on the first 90000.
split = 90000
x_train, y_train, x_valid, y_valid = se_x[:split], y_train[:split], se_x[split:], y_train[split:]
x_train = x_train.astype(np.float32, copy=False)
x_valid = x_valid.astype(np.float32, copy=False)

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [ ]:
params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['max_depth'] = 10
params['silent'] = 0
# Train for up to 10000 rounds, stopping once valid-mae has not improved for 100.
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)
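Once early stopping fires, the booster remembers its best round. A sketch of re-scoring the hold-out with it, using the old xgboost API this notebook runs on (best_ntree_limit is only set when early stopping is used):

pred = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
print('valid mae: %.6f' % np.mean(np.abs(pred - y_valid)))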
