In [1]:
import pandas as pd
from scipy import stats
import numpy as np # linear algebra
import sklearn
import gc
from sklearn.preprocessing import StandardScaler
import xgboost as xgb



In [2]:
# Try each scikit-learn feature selection approach in turn
# Removing features with low variance
from sklearn.feature_selection import VarianceThreshold
# Univariate feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
# L1-based feature selection
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
# Tree-based feature selection
from sklearn.ensemble import ExtraTreesClassifier


# Feature selection as part of a pipeline (listed in the plan, not exercised below)
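LinearSVC is imported above for the L1-based route but never actually run in this notebook. As a minimal sketch of that step (an assumption, not the author's code): since logerror is continuous, an L1-penalised regressor such as Lasso stands in for the classifier, using the x_train/y_train built in the cells below.

from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.01).fit(x_train, y_train)  # alpha is an untuned placeholder
l1_model = SelectFromModel(lasso, prefit=True)   # keeps features with non-zero coefficients
se_x = l1_model.transform(x_train)
se_x.shape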

In [3]:
prop = pd.read_csv('D:\\data\\properties_2016.csv',
                   dtype={'hashottuborspa': 'bool', 'propertycountylandusecode': 'object',
                          'propertyzoningdesc': 'object', 'fireplaceflag': 'bool',
                          'taxdelinquencyflag': 'object'})
train = pd.read_csv('D:\\data\\train_2016.csv')
# Downcast float64 columns to float32 to halve their memory footprint.
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)
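A quick sanity check, not part of the original run, that the float32 downcast roughly halves the memory held by the numeric columns:

print('%.1f MB' % (prop.memory_usage(deep=True).sum() / 1024 ** 2))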

In [5]:
# Attach the property features to each training transaction.
df_train = train.merge(prop, how='left', on='parcelid')

In [6]:
# Drop the target, identifiers, and the two free-text zoning columns.
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate',
                         'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values

In [7]:
# Cast the remaining object columns to booleans; comparing with True sends
# every non-boolean value (NaN included) to False.
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
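Note that taxdelinquencyflag stores 'Y' strings, so the == True comparison above maps that column entirely to False. A hedged variant, not in the original, that also counts the 'Y' strings as positive flags:

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = x_train[c].isin([True, 'Y'])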

In [8]:
# Impute missing values with the column means.
mean = x_train.mean(axis=0)
x_train.fillna(mean, inplace=True)
print('fill nan')


fill nan

In [9]:
x_train.shape
# baseline: xgb on all 55 features, best [231] train-mae:0.065133 valid-mae:0.073991


Out[9]:
(90275, 55)

In [15]:
# Removing features with low variance: drop columns whose variance is below 0.02.
sel = VarianceThreshold(threshold=0.02)
se_x = sel.fit_transform(x_train)
se_x.shape
# xgb mae best [217] train-mae:0.065649 valid-mae:0.074148


Out[15]:
(90000, 15)
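fit_transform returns a bare ndarray, so the surviving column names are lost. A small follow-up sketch, assuming sel from the cell above is still in memory, to recover them:

kept = x_train.columns[sel.get_support()]
print(list(kept))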

In [12]:
# Univariate feature selection: keep the 40 features with the highest F-test scores.
se_x = SelectKBest(f_regression, k=40).fit_transform(x_train, y_train)
se_x.shape
# the subsequent xgb run crashed the notebook kernel, so no score was recorded


Out[12]:
(90000, 40)
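A variant of the same univariate selection, added here as an illustration, that keeps the column names around for inspection:

skb = SelectKBest(f_regression, k=40).fit(x_train, y_train)
print(list(x_train.columns[skb.get_support()]))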

In [12]:
# Tree-based feature selection: fit a random forest and keep features whose
# importance exceeds the mean (SelectFromModel's default threshold).
from sklearn import ensemble
clf = ensemble.RandomForestRegressor(n_estimators=100, n_jobs=17)
clf = clf.fit(x_train, y_train)
model = SelectFromModel(clf, prefit=True)
se_x = model.transform(x_train)
se_x.shape
# xgb mae best [222] train-mae:0.065165 valid-mae:0.074297


Out[12]:
(90275, 15)
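To see which columns SelectFromModel kept, the forest's importances can be ranked directly; this sketch assumes clf from the cell above is still live:

imp = pd.Series(clf.feature_importances_, index=x_train.columns)
print(imp.sort_values(ascending=False).head(15))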

In [16]:
# Hold out the last 275 rows for validation; train on the first 90000.
split = 90000
x_train, y_train, x_valid, y_valid = se_x[:split], y_train[:split], se_x[split:], y_train[split:]
x_train = x_train.astype(np.float32, copy=False)
x_valid = x_valid.astype(np.float32, copy=False)

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [ ]:
params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['max_depth'] = 10
params['silent'] = 0
# Train for up to 10000 rounds, stopping once valid-mae has not improved for 100.
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)
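Once early stopping fires, the booster remembers its best round. A sketch of re-scoring the hold-out with it, using the old xgboost API this notebook runs on (best_ntree_limit is only set when early stopping is used):

pred = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
print('valid mae: %.6f' % np.mean(np.abs(pred - y_valid)))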
