In [ ]:
# Create a method to detect data shift between train and test sets,
## and identify which features cause the shift.
## The idea (often called adversarial validation): label train rows 0 and test
## rows 1, then check how well a classifier can tell them apart from a single
## feature; a high ROC AUC means that feature's distribution differs between the two sets.

# Download the dataset here: https://www.kaggle.com/c/sberbank-russian-housing-market/data

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

# price_doc is the label


Out[3]:
id timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq ... cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
0 1 2011-08-20 43 27.0 4.0 NaN NaN NaN NaN NaN ... 9 4 0 13 22 1 0 52 4 5850000
1 2 2011-08-23 34 19.0 3.0 NaN NaN NaN NaN NaN ... 15 3 0 15 29 1 10 66 14 6000000
2 3 2011-08-27 43 29.0 2.0 NaN NaN NaN NaN NaN ... 10 3 0 11 27 0 4 67 10 5700000
3 4 2011-09-01 89 50.0 9.0 NaN NaN NaN NaN NaN ... 11 2 1 4 4 0 0 26 3 13100000
4 5 2011-09-05 77 77.0 4.0 NaN NaN NaN NaN NaN ... 319 108 17 135 236 2 91 195 14 16331452

5 rows × 292 columns


In [5]:
train.dtypes


Out[5]:
id                                         int64
timestamp                                 object
full_sq                                    int64
life_sq                                  float64
floor                                    float64
max_floor                                float64
material                                 float64
build_year                               float64
num_room                                 float64
kitch_sq                                 float64
state                                    float64
product_type                              object
sub_area                                  object
area_m                                   float64
raion_popul                                int64
green_zone_part                          float64
indust_part                              float64
children_preschool                         int64
preschool_quota                          float64
preschool_education_centers_raion          int64
children_school                            int64
school_quota                             float64
school_education_centers_raion             int64
school_education_centers_top_20_raion      int64
hospital_beds_raion                      float64
healthcare_centers_raion                   int64
university_top_20_raion                    int64
sport_objects_raion                        int64
additional_education_raion                 int64
culture_objects_top_25                    object
                                          ...   
big_church_count_3000                      int64
church_count_3000                          int64
mosque_count_3000                          int64
leisure_count_3000                         int64
sport_count_3000                           int64
market_count_3000                          int64
green_part_5000                          float64
prom_part_5000                           float64
office_count_5000                          int64
office_sqm_5000                            int64
trc_count_5000                             int64
trc_sqm_5000                               int64
cafe_count_5000                            int64
cafe_sum_5000_min_price_avg              float64
cafe_sum_5000_max_price_avg              float64
cafe_avg_price_5000                      float64
cafe_count_5000_na_price                   int64
cafe_count_5000_price_500                  int64
cafe_count_5000_price_1000                 int64
cafe_count_5000_price_1500                 int64
cafe_count_5000_price_2500                 int64
cafe_count_5000_price_4000                 int64
cafe_count_5000_price_high                 int64
big_church_count_5000                      int64
church_count_5000                          int64
mosque_count_5000                          int64
leisure_count_5000                         int64
sport_count_5000                           int64
market_count_5000                          int64
price_doc                                  int64
Length: 292, dtype: object

In [6]:
test.head()


Out[6]:
id timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq ... cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000
0 30474 2015-07-01 39.0 20.7 2 9 1 1998.0 1 8.9 ... 8 0 0 0 1 10 1 0 14 1
1 30475 2015-07-01 79.2 NaN 8 17 1 0.0 3 1.0 ... 4 1 1 0 2 11 0 1 12 1
2 30476 2015-07-01 40.5 25.1 3 5 2 1960.0 2 4.8 ... 42 11 4 0 10 21 0 10 71 11
3 30477 2015-07-01 62.8 36.0 17 17 1 2016.0 2 62.8 ... 1 1 2 0 0 10 0 0 2 0
4 30478 2015-07-01 40.0 40.0 17 17 1 0.0 1 1.0 ... 5 1 1 0 2 12 0 1 11 1

5 rows × 291 columns


In [7]:
# Preprocess the data before checking for data shift

## Case differences (lowercase vs. uppercase) can matter for categorical data;
## that kind of consistency check is skipped here


## Deal with missing data:
## fill categorical columns with the mode, numeric columns with the median
for i in train.columns:
    if train[i].dtype == 'object':
        train[i] = train[i].fillna(train[i].mode().iloc[0])
    elif train[i].dtype in ('int64', 'float64'):
        train[i] = train[i].fillna(np.nanmedian(train[i]))

for i in test.columns:
    if test[i].dtype == 'object':
        test[i] = test[i].fillna(test[i].mode().iloc[0])
    elif test[i].dtype in ('int64', 'float64'):
        test[i] = test[i].fillna(np.nanmedian(test[i]))
# np.nanmedian ignores NaN when computing the median; a plain median
# (e.g. statistics.median) could be polluted by the NaN values themselves
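
The same imputation can be written with scikit-learn's SimpleImputer (available from scikit-learn 0.20). A minimal sketch, a variation on the cell above rather than the author's code, since it fills the test set with train statistics instead of the test set's own:

In [ ]:
from sklearn.impute import SimpleImputer

# illustrative alternative to the fillna loops above;
# test.columns is used so the train-only label price_doc is excluded
num_cols = test.select_dtypes(include=['int64', 'float64']).columns
cat_cols = test.select_dtypes(include=['object']).columns

num_imp = SimpleImputer(strategy='median')         # median for numeric columns
cat_imp = SimpleImputer(strategy='most_frequent')  # mode for categorical columns

# fit on train, then fill test with the *train* statistics
train[num_cols] = num_imp.fit_transform(train[num_cols])
test[num_cols] = num_imp.transform(test[num_cols])
train[cat_cols] = cat_imp.fit_transform(train[cat_cols])
test[cat_cols] = cat_imp.transform(test[cat_cols])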

In [8]:
train.head()


Out[8]:
id timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq ... cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
0 1 2011-08-20 43 27.0 4.0 12.0 1.0 1979.0 2.0 6.0 ... 9 4 0 13 22 1 0 52 4 5850000
1 2 2011-08-23 34 19.0 3.0 12.0 1.0 1979.0 2.0 6.0 ... 15 3 0 15 29 1 10 66 14 6000000
2 3 2011-08-27 43 29.0 2.0 12.0 1.0 1979.0 2.0 6.0 ... 10 3 0 11 27 0 4 67 10 5700000
3 4 2011-09-01 89 50.0 9.0 12.0 1.0 1979.0 2.0 6.0 ... 11 2 1 4 4 0 0 26 3 13100000
4 5 2011-09-05 77 77.0 4.0 12.0 1.0 1979.0 2.0 6.0 ... 319 108 17 135 236 2 91 195 14 16331452

5 rows × 292 columns


In [9]:
train.timestamp.unique()


Out[9]:
array(['2011-08-20', '2011-08-23', '2011-08-27', ..., '2015-06-27',
       '2015-06-29', '2015-06-30'], dtype=object)

In [10]:
## Convert categorical to numerical (commonly called "label encoding", though I find that name confusing)
number = LabelEncoder()
for i in train.columns:
    if (train[i].dtype == 'object'):
        train[i] = number.fit_transform(train[i].astype('str'))
        train[i] = train[i].astype('object')

for i in test.columns:
    if (test[i].dtype == 'object'):
        test[i] = number.fit_transform(test[i].astype('str'))
        test[i] = test[i].astype('object')
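
One caveat: fitting a separate LabelEncoder on train and test can map the same category to different integers, which would itself show up as drift even when the underlying distributions match. A minimal sketch of a consistent alternative, meant to run in place of the two loops above, fitting each encoder on the union of both sets:

In [ ]:
# fit each encoder on the combined train+test values so identical
# categories receive identical integer codes in both sets
for col in test.columns:                 # skips the train-only label price_doc
    if train[col].dtype == 'object':
        le = LabelEncoder()
        le.fit(pd.concat([train[col], test[col]]).astype('str'))
        train[col] = le.transform(train[col].astype('str'))
        test[col] = le.transform(test[col].astype('str'))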

In [11]:
train.head()


Out[11]:
id timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq ... cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
0 1 0 43 27.0 4.0 12.0 1.0 1979.0 2.0 6.0 ... 9 4 0 13 22 1 0 52 4 5850000
1 2 1 34 19.0 3.0 12.0 1.0 1979.0 2.0 6.0 ... 15 3 0 15 29 1 10 66 14 6000000
2 3 2 43 29.0 2.0 12.0 1.0 1979.0 2.0 6.0 ... 10 3 0 11 27 0 4 67 10 5700000
3 4 3 89 50.0 9.0 12.0 1.0 1979.0 2.0 6.0 ... 11 2 1 4 4 0 0 26 3 13100000
4 5 4 77 77.0 4.0 12.0 1.0 1979.0 2.0 6.0 ... 319 108 17 135 236 2 91 195 14 16331452

5 rows × 292 columns


In [35]:
## Create a new column 'origin' to mark which rows come from train (0) and which from test (1)
train['origin'] = 0
test['origin'] = 1
training = train.drop('price_doc',axis=1) # drop the label

In [36]:
## Take random samples from the training and test data
## (fixed seeds for reproducibility; sampling also keeps the two classes roughly balanced)
training = training.sample(9000, random_state=77)
testing = test.sample(7000, random_state=99)

In [37]:
## Combine the samples, then separate the 'origin' column from the features
combi = pd.concat([training, testing])
y = combi['origin']
combi.drop('origin',axis=1,inplace=True)

In [40]:
# Find drifting features that can be dropped and collect them in drop_list
## A feature whose mean ROC AUC is > 0.8 goes into drop_list,
## because the classifier can use that feature alone to tell "train" from "test"

model = RandomForestClassifier(n_estimators = 50, max_depth = 5, min_samples_leaf = 5)
drop_list = []

for i in combi.columns:
    score = cross_val_score(model,pd.DataFrame(combi[i]),y,cv=5,scoring='roc_auc')
    if (np.mean(score) > 0.8):
        drop_list.append(i)
        print(i,np.mean(score))


('id', 1.0)
('timestamp', 0.95935706349206351)
('kitch_sq', 0.89323138888888887)
('cafe_sum_500_min_price_avg', 0.83561873015873012)
('cafe_avg_price_500', 0.83495988095238105)
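
The loop above scores one feature at a time. A complementary check (an addition, not in the original) is to fit a single classifier on all features at once: the overall AUC says how separable train and test are, and the feature importances point at the main culprits.

In [ ]:
# adversarial validation on all features at once
model_all = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=5)
print('overall AUC:', np.mean(cross_val_score(model_all, combi, y, cv=5, scoring='roc_auc')))

# the most important features drive the train/test separation
model_all.fit(combi, y)
for idx in np.argsort(model_all.feature_importances_)[::-1][:10]:
    print(combi.columns[idx], model_all.feature_importances_[idx])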

In [42]:
# Decide how to treat the drifting features in order to improve the final model's predictions

training = train.drop('origin',axis=1)
testing = test.drop('origin',axis=1)

# a regressor is used here because the label (price_doc) is numerical
rf = RandomForestRegressor(n_estimators=200, max_depth=4,max_features=10)

rf.fit(training.drop('price_doc',axis=1),training['price_doc'])   # note: 'id' is still among the features at this point
pred = rf.predict(testing)

In [43]:
# Before dropping the drifting features, check feature importance first
## Plot the top 10 feature importances and see whether any drifting feature is among them
features = training.drop('price_doc',axis=1).columns.values  # match the columns the model was fit on
imp = rf.feature_importances_
indices = np.argsort(imp)[::-1][:10]

#plot
plt.figure(figsize=(7,9))
plt.bar(range(len(indices)), imp[indices], color = 'purple', align='center')
plt.xticks(range(len(indices)), features[indices], rotation='vertical')
plt.xlim([-1,len(indices)])
plt.show()

# kitch_sq is both important and drifting; keep this feature
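
For a quick look without rendering the figure, the same ranking can be printed as text (reusing the indices, imp, and features arrays defined above):

In [ ]:
# text version of the top-10 importance ranking plotted above
for idx in indices:
    print(features[idx], imp[idx])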



In [45]:
## Drop the drifting features that are not important
drift_train = training.drop(['id','cafe_sum_500_min_price_avg','cafe_avg_price_500'], axis=1)
drift_test = testing.drop(['id','cafe_sum_500_min_price_avg','cafe_avg_price_500'], axis=1)

rf = RandomForestRegressor(n_estimators=200, max_depth=4,max_features=10)
rf.fit(drift_train.drop('price_doc',axis=1), drift_train['price_doc'])

# submit your result to Kaggle: https://www.kaggle.com/c/sberbank-russian-housing-market/data
pred = rf.predict(drift_test)
columns = ['price_doc']
sub = pd.DataFrame(data=pred,columns=columns)
sub['id'] = test['id']
sub = sub[['id','price_doc']]
sub.to_csv('without_drifting.csv', index=False)


Out[45]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features=10, max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
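
To quantify what dropping the drifting features changes, the two setups can be compared with cross-validation; a minimal sketch with illustrative variable names. Keep in mind that in-sample CV cannot fully capture the benefit, since drift by definition only shows up on the test set, so the leaderboard score is the real check.

In [ ]:
# compare CV error with and without the drifting features
# (neg_mean_squared_error: closer to 0 is better)
rf_cmp = RandomForestRegressor(n_estimators=200, max_depth=4, max_features=10)
score_full = cross_val_score(rf_cmp, training.drop('price_doc', axis=1),
                             training['price_doc'], cv=5,
                             scoring='neg_mean_squared_error')
score_drift = cross_val_score(rf_cmp, drift_train.drop('price_doc', axis=1),
                              drift_train['price_doc'], cv=5,
                              scoring='neg_mean_squared_error')
print('with drifting features   :', np.mean(score_full))
print('without drifting features:', np.mean(score_drift))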

In [7]:
# method 2 - MLBox

from mlbox.preprocessing import *

In [8]:
train.head()   # no manual cleaning needed here; MLBox does the cleaning for you


Out[8]:
id timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq ... cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
0 1 2011-08-20 43 27.0 4.0 NaN NaN NaN NaN NaN ... 9 4 0 13 22 1 0 52 4 5850000
1 2 2011-08-23 34 19.0 3.0 NaN NaN NaN NaN NaN ... 15 3 0 15 29 1 10 66 14 6000000
2 3 2011-08-27 43 29.0 2.0 NaN NaN NaN NaN NaN ... 10 3 0 11 27 0 4 67 10 5700000
3 4 2011-09-01 89 50.0 9.0 NaN NaN NaN NaN NaN ... 11 2 1 4 4 0 0 26 3 13100000
4 5 2011-09-05 77 77.0 4.0 NaN NaN NaN NaN NaN ... 319 108 17 135 236 2 91 195 14 16331452

5 rows × 292 columns


In [15]:
cols = train.columns
cols


Out[15]:
Index(['id', 'timestamp', 'full_sq', 'life_sq', 'floor', 'max_floor',
       'material', 'build_year', 'num_room', 'kitch_sq',
       ...
       'cafe_count_5000_price_2500', 'cafe_count_5000_price_4000',
       'cafe_count_5000_price_high', 'big_church_count_5000',
       'church_count_5000', 'mosque_count_5000', 'leisure_count_5000',
       'sport_count_5000', 'market_count_5000', 'price_doc'],
      dtype='object', length=292)

In [10]:
data = Reader(sep=",").train_test_split(["train.csv", "test.csv"], "price_doc")


reading csv : train.csv ...
cleaning data...
CPU time: 38.210623025894165 seconds

reading csv : test.csv ...
cleaning data...
CPU time: 1.2265539169311523 seconds

number of common features : 294

gathering and crunching for train and test datasets
reindexing for train and test datasets
dropping training duplicates
dropping constant variables on training set

number of categorical features : 15
number of numerical features : 279
number of training samples : 30471
number of test samples : 7662

Top sparse features (% missing values on train set):
hospital_beds_raion           47.4
build_year                    44.6
state                         44.5
cafe_avg_price_500            43.6
cafe_sum_500_max_price_avg    43.6
dtype: float64

task : regression

In [12]:
# Delete non-stable (shifting) features
## 0.8 is the default threshold

data = Drift_thresholder().fit_transform(data)


computing drifts...
CPU time: 4.431854009628296 seconds

Top 10 drifts

('id', 1.0)
('timestamp_TIMESTAMP', 1.0)
('kitch_sq', 0.85066615773743659)
('state', 0.69643064512586084)
('life_sq', 0.68690434781272502)
('material', 0.67251970247347859)
('build_year', 0.6713825597594798)
('max_floor', 0.66605720777439148)
('num_room', 0.66429234043615915)
('full_sq', 0.65005102278248572)

deleted variables : ['id', 'kitch_sq', 'timestamp_TIMESTAMP']

dumping drift coefficients into directory : save
drift coefficients dumped
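
If the default cut-off deletes too many or too few features, Drift_thresholder takes an explicit threshold (keyword per the MLBox documentation); for example, a stricter run:

In [ ]:
# only delete features whose drift score exceeds 0.9
data = Drift_thresholder(threshold=0.9).fit_transform(data)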