This notebook handles the non-Bayesian data prep. I had a bit of a love/hate relationship with it until I did out-of-bag testing and found that it doesn't actually increase overfitting.
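For context, here is a minimal sketch of the kind of out-of-fold check that claim refers to; the fold setup and the stand-in classifier are my assumptions, not the exact validation the submodels use:

# Illustrative only: mean out-of-fold log loss for a candidate feature set.
# RandomForestClassifier is a stand-in model, not the one used downstream.
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

def oof_logloss(X, y, n_splits=5, seed=0):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    scores = []
    for tr, va in skf.split(X, y):
        clf = RandomForestClassifier(n_estimators=200, random_state=seed)
        clf.fit(X[tr], y[tr])
        scores.append(log_loss(y[va], clf.predict_proba(X[va])))
    return np.mean(scores)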


In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import pickle

import sklearn.cluster

import Levenshtein

from multiprocessing import Pool

In [2]:
# LightGBM can't handle NaNs, but with pd.fillna downstream it's not *this* program's problem
MISSING = np.nan
# MISSING = -99999

In [3]:
#input data
train_df=pd.read_json('../input/train.json')
test_df=pd.read_json('../input/test.json')

train_df.reset_index(inplace=True)

In [4]:
# This listing's price was determined to be broken after looking at results from a prior run of a submodel; 1025 appears in its description.
train_df.loc[train_df.listing_id == 7122037, 'price'] = 1025

In [5]:
target_num_map = {'low':0, 'medium':1, 'high':2}
train_df['interest_cat'] = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

target_num_map_reg = {'low':0, 'medium': (.5 + (9/13)) / 2.0, 'high':1}
train_df['interest'] = np.array(train_df['interest_level'].apply(lambda x: target_num_map_reg[x]))

In [6]:
from sklearn.cluster import Birch

n_clusters = 92

# modified from https://www.kaggle.com/luisblanche/two-sigma-connect-rental-listing-inquiries/price-compared-to-neighborhood-median/run/1011514

def latlong_in_city(data):
    return (data.longitude>-74.05)&(data.longitude<-73.75)&(data.latitude>40.4)&(data.latitude<40.9)

data_c=train_df[latlong_in_city(train_df)].copy()
data_e=train_df[~latlong_in_city(train_df)].copy()

coords_c = data_c[['latitude', 'longitude']].values

brc = Birch(branching_factor=100, n_clusters=n_clusters, threshold=0.01,compute_labels=True)

brc.fit(coords_c)

coords_tr = train_df[['latitude', 'longitude']].values
coords_te = test_df[['latitude', 'longitude']].values

train_df['location_cluster'] = brc.predict(coords_tr)
test_df['location_cluster'] = brc.predict(coords_te)

train_df.loc[~latlong_in_city(train_df), 'location_cluster'] = -1
test_df.loc[~latlong_in_city(test_df), 'location_cluster'] = -1
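A quick sanity check (illustrative, not part of the original pipeline): the cluster labels should be 0..91 inside the bounding box, and the -1 bucket should hold exactly the listings outside it.

# illustrative: cluster sizes, and a check that -1 marks exactly the out-of-city listings
print(train_df.location_cluster.value_counts().sort_index().head())
print((train_df.location_cluster.eq(-1) == ~latlong_in_city(train_df)).all())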

In [7]:
imean = train_df.interest.mean()

In [8]:
chisum = 0
for key in ['location_cluster']:
    for g in train_df.groupby(key):
        if len(g[1]) > 20:
            chi = ((g[1].interest.mean() - imean) ** 2.0) * len(g[1])
            chisum += chi
            #print(g[0], len(g[1]), chi, g[1].interest.mean())
    
print(chisum)


195.02912068367883
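The quantity printed above is a chi-square-style score: for each cluster with more than 20 listings, the squared deviation of its mean interest from the overall mean, weighted by cluster size, summed over clusters. A reusable version might look like the following (the function name is mine, not from the original pipeline):

def cluster_interest_score(df, key, min_size=20):
    # sum of n_g * (mean_g - overall_mean)^2 over sufficiently large groups
    overall = df.interest.mean()
    score = 0.0
    for _, g in df.groupby(key):
        if len(g) > min_size:
            score += len(g) * (g.interest.mean() - overall) ** 2.0
    return score

# cluster_interest_score(train_df, 'location_cluster')  # matches the ~195.03 printed above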

In [9]:
# https://www.kaggle.com/ivanoliveri/two-sigma-connect-rental-listing-inquiries/new-features-from-addresses-fields

def get_leven_ratio_row(row):
    return Levenshtein.ratio(row.display_address.lower(), row.street_address.lower())

def get_leven_ratio(df):
    return df.apply(get_leven_ratio_row, axis=1)

with Pool(2) as pool:
    rv = pool.map(get_leven_ratio, [train_df, test_df])

train_df['address_ratio'] = rv[0]
test_df['address_ratio'] = rv[1]
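Levenshtein.ratio returns a normalized similarity in [0, 1] (1.0 for identical strings), so address_ratio is high when the display address is essentially the street address minus the house number. An illustrative call, not part of the pipeline:

# identical strings score 1.0; a display address that is a substring of the
# street address scores close to, but below, 1.0
Levenshtein.ratio('west 45th street', '350 west 45th street')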

In [10]:
# The infamous leak

image_time = pd.read_csv('../input/listing_image_time.csv')

train_df = pd.merge(train_df, image_time, left_on='listing_id', right_on='Listing_Id')
test_df = pd.merge(test_df, image_time, left_on='listing_id', right_on='Listing_Id')

In [11]:
# mostly from a few different public scripts

def preproc_df(train_df):
    #train_df["price"] = train_df["price"].clip(upper=13000)
    train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
    train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
    train_df['price_per_room'] = train_df['price']/train_df['room_sum']
    
    train_df['half_bathroom'] = train_df['bathrooms'] - np.floor(train_df['bathrooms'])

    train_df["num_photos"] = train_df["photos"].apply(len)
    train_df["num_features"] = train_df["features"].apply(len)
    train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
    train_df["description_length"] = train_df["description"].apply(lambda x: len(x))

    train_df["created"] = pd.to_datetime(train_df["created"])
    train_df["created_year"] = train_df["created"].dt.year
    train_df["created_month"] = train_df["created"].dt.month
    train_df["created_day"] = train_df["created"].dt.day
    train_df["created_dayofyear"] = train_df["created"].dt.dayofyear
    train_df["created_hour"] = train_df["created"].dt.hour
    train_df["created_epoch"] = pd.DatetimeIndex(train_df.created).astype(np.int64) // 1000000000
    
#    train_df["listing_div_day"] = train_df["created"].dt.hour

    train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
    #train_df["pos2"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)

    return train_df
    
train_df = preproc_df(train_df)    
test_df = preproc_df(test_df)    
    
vals = train_df['pos'].value_counts()
dvals = vals.to_dict()
train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))

In [12]:
base_features = ["address_ratio", "location_cluster","bathrooms", "bedrooms", "half_bathroom",
                 "latitude", "longitude", "price","price_t","price_per_room", "density",
                 "num_photos", "num_features", "num_description_words","listing_id", 
                 "created_year", "created_dayofyear", "created_month", "created_day", "created_hour", 'time_stamp']

Improved version of the TF-IDF processing from https://www.kaggle.com/sudalairajkumar/two-sigma-connect-rental-listing-inquiries/xgb-starter-in-python (despite the variable name, a plain CountVectorizer is used rather than TF-IDF weighting).

My version does more precleaning of the feature strings, which results in cleaner tokens and improved output quality.


In [13]:
# correct punctuation etc for features/prep for TF-IDFing
def featurefixer(l):
    rv = ''
    
    for f in l:
        s = f.lower()
        s = s.replace(' ', '_')
        s = s.replace('-', '_')
        s = s.replace('/', '_')
        s = s.replace('*', ' ')
        
        rv += s + ' '
        
    return rv


train_df['features'] = train_df["features"].apply(featurefixer)
test_df['features'] = test_df["features"].apply(featurefixer)

In [14]:
print(train_df["features"].head())

tfidf = CountVectorizer(stop_words='english', max_features=200)
tfidf.fit(train_df["features"])
#te_sparse = tfidf.transform(test_df["features"])


0                                                     
1    doorman elevator fitness_center cats_allowed d...
2    laundry_in_building dishwasher hardwood_floors...
3                              hardwood_floors no_fee 
4                                             pre_war 
Name: features, dtype: object
Out[14]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=200, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [15]:
tfidf_train = tfidf.transform(train_df['features']).toarray()
tfidf_test = tfidf.transform(test_df['features']).toarray()

tfidf_fn = ['f_{0}'.format(f) for f in tfidf.get_feature_names()]

pd_tfidf_train = pd.DataFrame(tfidf_train, columns=tfidf_fn, index=train_df.index)
pd_tfidf_test = pd.DataFrame(tfidf_test, columns=tfidf_fn, index=test_df.index)

# this was filtered out by chi squared measurements elsewhere... which might or might not be a great idea.  eh.

mergelist = [
#    [('f_central_ac', 'f_central_air', 'f_central_a_c'), 'f_central_ac'],
#    [('f_hi_rise', 'f_highrise'), 'fm_highrise'],
    [('f_ft_doorman', 'f_full_time_doorman',), 'f_doorman'],
#    [('f_washer_in_unit', 'f_dryer_in_unit', 'f_washer_dryer_in_unit', 'f_in_unit_washer_dryer'), 'fm_laundry_in_unit'],
#    [('f_concierge', 'f_concierge_service', 'f_24_7_concierge'), 'fm_concierge'],
#    [('f_roofdeck', 'f_roof_deck', 'f_rooftop_deck'), 'fm_roofdeck'],
    [('f_laundry_',), 'f_laundry'],
#    [('f_washer_dryer_in_building', 'f_private_laundry_room_on_every_floor', 'f_on_site_laundry', 'f_laundry_room', 'f_laundry_in_building'), 'fm_laundry_in_building'],
#    [('f_live_in_super',), 'f_live_in_superintendent'],
#    [('f_prewar','f__ornate_prewar_details_'), 'f_pre_war'],
    [('f_valet_services_including_dry_cleaning', 'f_valet_services'), 'f_valet'],
    [('f_wheelchair_ramp',), 'f_wheelchair_access'],
    [('f_high_ceiling',), 'f_high_ceilings'],
    [('f_terraces___balconies',), 'f_terrace'],
    [('f__dishwasher_',), 'f_dishwasher'],
    [('f_decorative_fireplace', 'f_fireplaces'), 'f_fireplace'],
#    [('f_common_parking_garage', 'f_full_service_garage', 'f_garage', 'f_on_site_garage'), 'fm_garage'],
    [('f_private_outdoor_space', 'f_common_outdoor_space'), 'f_outdoor_space'],
    [('f__elev_bldg_',), 'f_elevator']
    
]

def run_mergelist(df, mergelist):
    for m in mergelist:
        #print(m, m[1])
        
        if m[1] not in df:
            df[m[1]] = 0

        for merge in m[0]:
            #print('X ', merge, m[1])
            df[m[1]] |= df[merge]

            df.drop(merge, axis=1, inplace=True)
                
    return df
            
pd_tfidf_train = run_mergelist(pd_tfidf_train, mergelist)
pd_tfidf_test = run_mergelist(pd_tfidf_test, mergelist)

tfidf_fn = list(pd_tfidf_train.keys())

train_df = pd.merge(train_df, pd_tfidf_train, left_index=True, right_index=True)
test_df = pd.merge(test_df, pd_tfidf_test, left_index=True, right_index=True)

train_df['d_lower'] = train_df.description.apply(lambda s: s.lower())
test_df['d_lower'] = test_df.description.apply(lambda s: s.lower())

In [16]:
def printchi(df, key):
    imean = df.interest.mean()
    for k in key:
        subset = df[df[k] > 0]
        chi = ((subset.interest.mean() - imean) ** 2.0) * len(subset)
        print(k, len(subset), chi)


descmap = [
    [('exposed brick',), 'fd_exposed_brick'],
    [('fireplace',), 'fd_fireplace'],
    #[('doorman',), 'f_doorman'],
    [('microwave',), 'fd_microwave'],
    [('laundry in unit', 'washer dryer inside',), 'fd_laundry_in_unit'],
    [('dishwasher',), 'fd_dishwasher'],
    [('no fee',), 'fd_no_fee'],
    [('subway',), 'fd_subway'],
]

fd_features = [m[1] for m in descmap]

def backfill(df):
    for m in descmap:
        df[m[1]] = 0
        for keyword in m[0]:
            nv = df.d_lower.apply(lambda x: x.find(keyword) >= 0)
            df[m[1]] |= nv

        df[m[1]] = df[m[1]].astype(np.uint8)

backfill(train_df)
backfill(test_df)
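printchi is only defined for ad-hoc inspection; a call like the following (illustrative) prints, for each keyword flag, how many listings match and the same chi-square-style score used for the location clusters:

# illustrative: inspect which description keywords carry signal
printchi(train_df, fd_features)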

In [17]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
    if train_df[f].dtype == 'object':
        #print(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))
        base_features.append(f)

In [18]:
# a primitive version of the bedroom/location price grouper (-submodel-medium does much more work than this!)
train_medium = train_df[train_df.interest_level == 'medium']

price_group = {}

for g in train_medium.groupby(['bedrooms', 'location_cluster']):
    if len(g[1]) < 10:
        continue
        
    #print(g[0], g[1].price.mean())
    price_group[g[0]] = g[1].price.mean()

def apply_group(df):
    df['price_group'] = MISSING
    
    for g in df.groupby(['bedrooms', 'location_cluster']):
        if g[0] in price_group:
            df.loc[g[1].index, 'price_group'] = price_group[g[0]]
            
    df['price_ratio'] = df['price'] / df['price_group']
    
    return df

train_df = apply_group(train_df)
test_df = apply_group(test_df)

In [19]:
# non-Bayesian manager features.  These try to pick out 'lazy' managers who write overly short
# descriptions, don't assign building IDs properly, etc.

def apply_avg(df, means, key, clip = None):
    df[key] = MISSING # can't use NaNs with lightgbm yet
    
    for m in means:
        # NB: when clip is given, v is computed but the value stored below is
        # still the raw per-manager rate m[1]
        if clip is not None:
            v = m[1] > clip
            
        df.loc[df['manager_id'] == m[0], key] = m[1]
    
means_dl = []
means_b0 = []
means_f0 = []
median_price = []

for m in train_df[['manager_id', 'description_length', 'building_id', 'num_features', 'price']].groupby('manager_id'):
    if len(m[1]) < 5:
        continue
        
    means_dl.append((m[0], (m[1].description_length <= 8).mean()))
    means_b0.append((m[0], (m[1].building_id == 0).mean()))
    means_f0.append((m[0], (m[1].num_features == 0).mean()))
    median_price.append((m[0], m[1].price.median()))
    
for df in [train_df, test_df]:
    apply_avg(df, means_dl, 'manager_shortdesc_rate', 0.25)
    apply_avg(df, means_b0, 'manager_building0_rate', 0.77)
    apply_avg(df, means_f0, 'manager_0feature_rate')
    apply_avg(df, median_price, 'manager_median_price')
    df['manager_lazy_rate'] = np.clip(df.manager_shortdesc_rate + df.manager_building0_rate, 0, 1)

I tried a few different kernel density estimates and settled on these two as the most effective.


In [20]:
from sklearn.neighbors import KernelDensity

kde01e = KernelDensity(kernel='exponential', bandwidth=0.01).fit(train_df[['latitude', 'longitude']].values)

train_df['density_exp01'] = kde01e.score_samples(train_df[['latitude', 'longitude']].values)
test_df['density_exp01'] = kde01e.score_samples(test_df[['latitude', 'longitude']].values)

In [21]:
kde01 = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(train_df[['latitude', 'longitude']].values)

train_df['density_gaussian02'] = kde01.score_samples(train_df[['latitude', 'longitude']].values)
test_df['density_gaussian02'] = kde01.score_samples(test_df[['latitude', 'longitude']].values)
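Note that KernelDensity.score_samples returns the log of the estimated density, so both density_exp01 and density_gaussian02 are on a log scale. An actual density could be recovered with np.exp if it were ever needed (illustrative, not used in the pipeline):

# illustrative: score_samples gives log p(x); exponentiate to get the density itself
density = np.exp(kde01.score_samples(train_df[['latitude', 'longitude']].values))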

In [22]:
train_df.to_pickle('fin-dprep-train.pkl')
test_df.to_pickle('fin-dprep-test.pkl')

In [23]:
full_features = base_features + tfidf_fn + fd_features

In [24]:
pickle.dump(full_features, open('fin-dprep-flist.pkl', 'wb'))
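A minimal sketch of how a downstream modeling notebook might reload these artifacts (the file names are the ones written above; the rest is an assumption about the consuming notebook):

import pickle
import pandas as pd

train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
with open('fin-dprep-flist.pkl', 'rb') as f:
    full_features = pickle.load(f)

# e.g. X = train_df[full_features], y = train_df['interest_cat']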