In [ ]:
# Create a method to detect whether there is data shift
## and to identify which features caused the shift
# Download the dataset here: https://www.kaggle.com/c/sberbank-russian-housing-market/data
In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in newer versions
from sklearn.preprocessing import LabelEncoder
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [3]:
train.head()
# price_doc is the label
Out[3]:
In [5]:
train.dtypes
Out[5]:
In [6]:
test.head()
Out[6]:
In [7]:
# Preprocess the data first, before checking for data shift
## Case (lower/upper) inconsistencies in categorical data can matter; data-consistency checks are skipped here
## Deal with missing data
for i in train.columns:
    if train[i].dtype == 'object':
        train[i] = train[i].fillna(train[i].mode().iloc[0])
    if train[i].dtype == 'int64' or train[i].dtype == 'float64':
        train[i] = train[i].fillna(np.nanmedian(train[i]))
for i in test.columns:
    if test[i].dtype == 'object':
        test[i] = test[i].fillna(test[i].mode().iloc[0])
    if test[i].dtype == 'int64' or test[i].dtype == 'float64':
        test[i] = test[i].fillna(np.nanmedian(test[i]))
# np.nanmedian computes the median while ignoring NaN; statistics.median, by contrast,
# can be thrown off by NaN values
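In [ ]:
# A minimal sanity check of the nanmedian point above (illustrative values only):
demo = np.array([1.0, 2.0, np.nan, 4.0])
print(np.median(demo))     # nan - the plain median propagates NaN
print(np.nanmedian(demo))  # 2.0 - NaN is ignored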
In [8]:
train.head()
Out[8]:
In [9]:
train.timestamp.unique()
Out[9]:
In [10]:
## Categorical to numerical (people call it label encoding; I find the name confusing)
## Fit each encoder on the union of train and test values so both sets share one
## encoding; fitting train and test separately can give the same category different
## codes, which would later look like artificial drift
for i in train.columns:
    if train[i].dtype == 'object' and i in test.columns:
        number = LabelEncoder()
        number.fit(pd.concat([train[i], test[i]]).astype('str'))
        train[i] = number.transform(train[i].astype('str'))
        test[i] = number.transform(test[i].astype('str'))
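In [ ]:
# Illustration (hypothetical values) of why the shared fit above matters:
le_a, le_b = LabelEncoder(), LabelEncoder()
print(le_a.fit_transform(['no', 'yes']))  # no -> 0, yes -> 1
print(le_b.fit_transform(['yes']))        # yes -> 0: same category, different code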
In [11]:
train.head()
Out[11]:
In [35]:
## create a new column "origin" to mark which rows come from train and which from test
train['origin'] = 0
test['origin'] = 1
training = train.drop('price_doc',axis=1) # drop the label
In [36]:
## take random samples from the training and test data
training = training.sample(9000, random_state=77)
testing = test.sample(7000, random_state=99)
In [37]:
## combine the random samples, then separate the 'origin' column from the data
combi = pd.concat([training, testing])  # DataFrame.append was removed in pandas 2.0
y = combi['origin']
combi.drop('origin',axis=1,inplace=True)
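In [ ]:
# Optional sanity check: combined shape and class balance of the train-vs-test
# label (should show 9000 train rows vs 7000 test rows)
print(combi.shape)
print(y.value_counts())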
In [40]:
# Find the drifting features that can be dropped and collect them in drop_list
## If a feature's mean ROC AUC > 0.8, add it to drop_list:
## that feature alone can largely tell "train" from "test", i.e. it has drifted
model = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=5)
drop_list = []
for i in combi.columns:
    score = cross_val_score(model, pd.DataFrame(combi[i]), y, cv=5, scoring='roc_auc')
    if np.mean(score) > 0.8:
        drop_list.append(i)
        print(i, np.mean(score))
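In [ ]:
# Optional follow-up (a sketch, not part of the original flow): fit one classifier
# on all remaining features at once; a mean ROC AUC near 0.5 after dropping the
# flagged features suggests little shift is left
score_all = cross_val_score(model, combi.drop(drop_list, axis=1), y, cv=5, scoring='roc_auc')
print(np.mean(score_all))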
In [42]:
# How to treat drifting features in order to improve the final model's predictions
training = train.drop('origin', axis=1)
testing = test.drop('origin', axis=1)
# a regressor is used here because the label is numerical
rf = RandomForestRegressor(n_estimators=200, max_depth=4, max_features=10)
# drop id explicitly - it carries no signal and would otherwise be used as a feature
rf.fit(training.drop(['id', 'price_doc'], axis=1), training['price_doc'])
pred = rf.predict(testing.drop('id', axis=1))
In [43]:
# Before dropping drifting features, check feature importance first
## plot the top-10 feature importances and see whether any drifting feature is important
features = training.drop(['id', 'price_doc'], axis=1).columns.values  # must match the columns rf was fit on
imp = rf.feature_importances_
indices = np.argsort(imp)[::-1][:10]
# plot
plt.figure(figsize=(7, 9))
plt.bar(range(len(indices)), imp[indices], color='purple', align='center')
plt.xticks(range(len(indices)), features[indices], rotation='vertical')
plt.xlim([-1, len(indices)])
plt.show()
# kitch_sq is important, so although it shows drift, keep this feature
In [45]:
## drop the drifting features which are not important
drift_train = training.drop(['id', 'cafe_sum_500_min_price_avg', 'cafe_avg_price_500'], axis=1)
drift_test = testing.drop(['id', 'cafe_sum_500_min_price_avg', 'cafe_avg_price_500'], axis=1)
rf = RandomForestRegressor(n_estimators=200, max_depth=4, max_features=10)
rf.fit(drift_train.drop('price_doc', axis=1), drift_train['price_doc'])
# submit your result to Kaggle: https://www.kaggle.com/c/sberbank-russian-housing-market/data
pred = rf.predict(drift_test)
columns = ['price_doc']
sub = pd.DataFrame(data=pred,columns=columns)
sub['id'] = test['id']
sub = sub[['id','price_doc']]
sub.to_csv('without_drifting.csv', index=False)
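In [ ]:
# Quick check of the submission format before uploading (optional):
pd.read_csv('without_drifting.csv').head()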
In [7]:
# method 2 - MLBox
from mlbox.preprocessing import *
In [8]:
train.head() # no manual cleaning needed - mlbox does the cleaning for you
Out[8]:
In [15]:
cols = train.columns
cols
Out[15]:
In [10]:
data = Reader(sep=",").train_test_split(["train.csv", "test.csv"], "price_doc")
In [12]:
# delete non-stable/drifting features
## features whose drift score exceeds the threshold are dropped (MLBox's default is 0.6)
data = Drift_thresholder().fit_transform(data)
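In [ ]:
# data is a dict with "train", "test", and "target" entries; a quick way to see
# which columns survived the drift filter (a sketch, assuming the dict layout
# returned by Reader.train_test_split):
print(data["train"].columns)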