In [1]:
import pandas as pd
from scipy import stats
import numpy as np # linear algebra
import sklearn
import gc
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
In [2]:
#test every feature function
#Removing features with low variance
from sklearn.feature_selection import VarianceThreshold
#Univariate feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
# L1-based feature selection
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
#Tree-based feature selection
from sklearn.ensemble import ExtraTreesClassifier
#Feature selection as part of a pipeline
In [3]:
prop=pd.read_csv('D:\\data\\properties_2016.csv', dtype={'hashottuborspa':'bool','propertycountylandusecode':'object','propertyzoningdesc':'object', 'fireplaceflag':'bool', 'taxdelinquencyflag':'object'})
train=pd.read_csv('D:\\data\\train_2016.csv')
for c, dtype in zip(prop.columns, prop.dtypes):
if dtype == np.float64:
prop[c] = prop[c].astype(np.float32)
In [5]:
df_train = train.merge(prop, how='left', on='parcelid')
In [6]:
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate',
'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
In [7]:
for c in x_train.dtypes[x_train.dtypes == object].index.values:
x_train[c] = (x_train[c] == True)
In [8]:
mean=x_train.mean(axis=0)
x_train.fillna(mean, inplace=True)
print('fill nan')
In [9]:
x_train.shape
#origin mae best [231] train-mae:0.065133 valid-mae:0.073991
Out[9]:
In [15]:
#Removing features with low variance
sel = VarianceThreshold(threshold=(0.02))
se_x = sel.fit_transform(x_train)
se_x.shape
#xgb mae best [217] train-mae:0.065649 valid-mae:0.074148
Out[15]:
In [12]:
#Univariate feature selection
se_x = SelectKBest(f_regression, k=40).fit_transform(x_train, y_train)
se_x.shape
#kernel dump
Out[12]:
In [12]:
#Tree-based feature selection
from sklearn import ensemble
clf = ensemble.RandomForestRegressor(n_estimators=100,n_jobs=17)
clf = clf.fit(x_train, y_train)
model = SelectFromModel(clf, prefit=True)
se_x = model.transform(x_train)
se_x.shape
#xgb mae best [222] train-mae:0.065165 valid-mae:0.074297
Out[12]:
In [16]:
split = 90000
x_train, y_train, x_valid, y_valid = se_x[:split], y_train[:split], se_x[split:], y_train[split:]
x_train = x_train.astype(np.float32, copy=False)
x_valid = x_valid.astype(np.float32, copy=False)
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
In [ ]:
params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['max_depth'] = 10
params['silent'] = 0
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)
In [ ]: