In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
import sklearn.cluster
import Levenshtein
from multiprocessing import Pool
In [2]:
# lightgbm can't handle NaNs, but with pd.fillna downstream it's not *this* program's problem
MISSING = np.nan
# MISSING = -99999
In [3]:
# input data
train_df = pd.read_json('../input/train.json')
test_df = pd.read_json('../input/test.json')
train_df.reset_index(inplace=True)
In [4]:
# determined to be broken after looking at results from a prior run of a submodel; 1025 appears in its description.
train_df.loc[train_df.listing_id == 7122037, 'price'] = 1025
In [5]:
target_num_map = {'low':0, 'medium':1, 'high':2}
train_df['interest_cat'] = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
target_num_map_reg = {'low':0, 'medium': (.5 + (9/13)) / 2.0, 'high':1}
train_df['interest'] = np.array(train_df['interest_level'].apply(lambda x: target_num_map_reg[x]))
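For reference, this regression-style target puts 'medium' a bit above the midpoint: (0.5 + 9/13) / 2 ≈ 0.596. A quick print of the mapping (my addition, purely for inspection):

for level in ['low', 'medium', 'high']:
    print(level, round(target_num_map_reg[level], 4))  # low 0, medium ~0.5962, high 1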
In [6]:
from sklearn.cluster import Birch
n_clusters = 92
# modified from https://www.kaggle.com/luisblanche/two-sigma-connect-rental-listing-inquiries/price-compared-to-neighborhood-median/run/1011514
def latlong_in_city(data):
    return (data.longitude > -74.05) & (data.longitude < -73.75) & (data.latitude > 40.4) & (data.latitude < 40.9)
data_c = train_df[latlong_in_city(train_df)].copy()
data_e = train_df[~latlong_in_city(train_df)].copy()
coords_c = data_c[['latitude', 'longitude']].values  # .as_matrix() was removed in pandas 1.0
brc = Birch(branching_factor=100, n_clusters=n_clusters, threshold=0.01, compute_labels=True)
brc.fit(coords_c)
coords_tr = train_df[['latitude', 'longitude']].values
coords_te = test_df[['latitude', 'longitude']].values
train_df['location_cluster'] = brc.predict(coords_tr)
test_df['location_cluster'] = brc.predict(coords_te)
train_df.loc[~latlong_in_city(train_df), 'location_cluster'] = -1
test_df.loc[~latlong_in_city(test_df), 'location_cluster'] = -1
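A quick sanity check on the clustering output (my addition, not in the original run): cluster sizes and the out-of-bounds count.

print(train_df['location_cluster'].value_counts().head())
print((train_df.location_cluster == -1).sum(), 'train listings outside the NYC bounding box')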
In [7]:
imean = train_df.interest.mean()
In [8]:
chisum = 0
for key in ['location_cluster']:
    for g in train_df.groupby(key):
        if len(g[1]) > 20:
            chi = ((g[1].interest.mean() - imean) ** 2.0) * len(g[1])
            chisum += chi
            #print(g[0], len(g[1]), chi, g[1].interest.mean())
print(chisum)
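The accumulated quantity is a between-groups sum of squares, sum over groups of n_g * (mean_g - global_mean)^2, a rough signal of how much a grouping separates the interest target. A sketch generalizing the loop above (my addition; the key names are just examples):

def group_strength(df, key, min_count=20):
    # between-group sum of squares of the 'interest' target
    gmean = df.interest.mean()
    return sum(((g.interest.mean() - gmean) ** 2.0) * len(g)
               for _, g in df.groupby(key) if len(g) > min_count)

for key in ['location_cluster', 'bedrooms']:
    print(key, group_strength(train_df, key))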
In [9]:
# https://www.kaggle.com/ivanoliveri/two-sigma-connect-rental-listing-inquiries/new-features-from-addresses-fields
def get_leven_ratio_row(row):
    return Levenshtein.ratio(row.display_address.lower(), row.street_address.lower())

def get_leven_ratio(df):
    return df.apply(get_leven_ratio_row, axis=1)

with Pool(2) as pool:
    rv = pool.map(get_leven_ratio, [train_df, test_df])
train_df['address_ratio'] = rv[0]
test_df['address_ratio'] = rv[1]
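For intuition, Levenshtein.ratio returns a similarity in [0, 1]; the addresses below are made-up examples (my addition):

print(Levenshtein.ratio('792 metropolitan avenue', '792 metropolitan avenue, brooklyn'))  # ~0.82
print(Levenshtein.ratio('w 57th st', '400 west 57th street'))  # noticeably lower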
In [10]:
# The infamous leak
image_time = pd.read_csv('../input/listing_image_time.csv')
train_df = pd.merge(train_df, image_time, left_on='listing_id', right_on='Listing_Id')
test_df = pd.merge(test_df, image_time, left_on='listing_id', right_on='Listing_Id')
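These are inner merges, so rows whose listing_id is missing from the CSV would be silently dropped; a defensive check (my addition, not in the original):

print(len(train_df), len(test_df))  # should match the original frame sizes
assert train_df['time_stamp'].notnull().all() and test_df['time_stamp'].notnull().all()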
In [11]:
# mostly from a few different public scripts
def preproc_df(train_df):
    #train_df["price"] = train_df["price"].clip(upper=13000)
    train_df["price_t"] = train_df["price"] / train_df["bedrooms"]  # inf for 0-bedroom studios
    train_df["room_sum"] = train_df["bedrooms"] + train_df["bathrooms"]
    train_df['price_per_room'] = train_df['price'] / train_df['room_sum']
    train_df['half_bathroom'] = train_df['bathrooms'] - np.floor(train_df['bathrooms'])
    train_df["num_photos"] = train_df["photos"].apply(len)
    train_df["num_features"] = train_df["features"].apply(len)
    train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
    train_df["description_length"] = train_df["description"].apply(len)
    train_df["created"] = pd.to_datetime(train_df["created"])
    train_df["created_year"] = train_df["created"].dt.year
    train_df["created_month"] = train_df["created"].dt.month
    train_df["created_day"] = train_df["created"].dt.day
    train_df["created_dayofyear"] = train_df["created"].dt.dayofyear
    train_df["created_hour"] = train_df["created"].dt.hour
    train_df["created_epoch"] = pd.DatetimeIndex(train_df.created).astype(np.int64) // 1000000000
    # train_df["listing_div_day"] = train_df["created"].dt.hour
    train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
    #train_df["pos2"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
    return train_df
train_df = preproc_df(train_df)
test_df = preproc_df(test_df)
vals = train_df['pos'].value_counts()
dvals = vals.to_dict()
train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
In [12]:
base_features = ["address_ratio", "location_cluster","bathrooms", "bedrooms", "half_bathroom",
"latitude", "longitude", "price","price_t","price_per_room", "density",
"num_photos", "num_features", "num_description_words","listing_id",
"created_year", "created_dayofyear", "created_month", "created_day", "created_hour", 'time_stamp']
Improved version of the TF-IDF processing from https://www.kaggle.com/sudalairajkumar/two-sigma-connect-rental-listing-inquiries/xgb-starter-in-python
My version does more pre-cleaning of the feature strings, which improves the quality of the resulting tokens.
In [13]:
# correct punctuation etc for features/prep for TF-IDFing
def featurefixer(l):
    rv = ''
    for f in l:
        s = f.lower()
        s = s.replace(' ', '_')
        s = s.replace('-', '_')
        s = s.replace('/', '_')
        s = s.replace('*', ' ')
        rv += s + ' '
    return rv
train_df['features'] = train_df["features"].apply(featurefixer)
test_df['features'] = test_df["features"].apply(featurefixer)
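A quick trace of what the cleaning does (output derived from the function above; the feature strings are made-up examples):

print(featurefixer(['Hardwood Floors', 'Laundry in Building', 'Cats Allowed']))
# -> 'hardwood_floors laundry_in_building cats_allowed '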
In [14]:
print(train_df["features"].head())
# note: despite the variable name, this is a CountVectorizer (token counts), not TF-IDF
tfidf = CountVectorizer(stop_words='english', max_features=200)
tfidf.fit(train_df["features"])
#te_sparse = tfidf.transform(test_df["features"])
In [15]:
tfidf_train = tfidf.transform(train_df['features']).toarray()
tfidf_test = tfidf.transform(test_df['features']).toarray()
tfidf_fn = ['f_{0}'.format(f) for f in tfidf.get_feature_names_out()]  # get_feature_names() was removed in sklearn 1.2
pd_tfidf_train = pd.DataFrame(tfidf_train, columns=tfidf_fn, index=train_df.index)
pd_tfidf_test = pd.DataFrame(tfidf_test, columns=tfidf_fn, index=test_df.index)
# the commented-out merge groups below were filtered out by chi-squared measurements elsewhere... which might or might not be a great idea. eh.
mergelist = [
# [('f_central_ac', 'f_central_air', 'f_central_a_c'), 'f_central_ac'],
# [('f_hi_rise', 'f_highrise'), 'fm_highrise'],
[('f_ft_doorman', 'f_full_time_doorman',), 'f_doorman'],
# [('f_washer_in_unit', 'f_dryer_in_unit', 'f_washer_dryer_in_unit', 'f_in_unit_washer_dryer'), 'fm_laundry_in_unit'],
# [('f_concierge', 'f_concierge_service', 'f_24_7_concierge'), 'fm_concierge'],
# [('f_roofdeck', 'f_roof_deck', 'f_rooftop_deck'), 'fm_roofdeck'],
[('f_laundry_',), 'f_laundry'],
# [('f_washer_dryer_in_building', 'f_private_laundry_room_on_every_floor', 'f_on_site_laundry', 'f_laundry_room', 'f_laundry_in_building'), 'fm_laundry_in_building'],
# [('f_live_in_super',), 'f_live_in_superintendent'],
# [('f_prewar','f__ornate_prewar_details_'), 'f_pre_war'],
[('f_valet_services_including_dry_cleaning', 'f_valet_services'), 'f_valet'],
[('f_wheelchair_ramp',), 'f_wheelchair_access'],
[('f_high_ceiling',), 'f_high_ceilings'],
[('f_terraces___balconies',), 'f_terrace'],
[('f__dishwasher_',), 'f_dishwasher'],
[('f_decorative_fireplace', 'f_fireplaces'), 'f_fireplace'],
# [('f_common_parking_garage', 'f_full_service_garage', 'f_garage', 'f_on_site_garage'), 'fm_garage'],
[('f_private_outdoor_space', 'f_common_outdoor_space'), 'f_outdoor_space'],
[('f__elev_bldg_',), 'f_elevator']
]
def run_mergelist(df, mergelist):
    for m in mergelist:
        #print(m, m[1])
        if m[1] not in df:
            df[m[1]] = 0
        for merge in m[0]:
            #print('X ', merge, m[1])
            df[m[1]] |= df[merge]
            df.drop(merge, axis=1, inplace=True)
    return df
pd_tfidf_train = run_mergelist(pd_tfidf_train, mergelist)
pd_tfidf_test = run_mergelist(pd_tfidf_test, mergelist)
tfidf_fn = list(pd_tfidf_train.keys())
train_df = pd.merge(train_df, pd_tfidf_train, left_index=True, right_index=True)
test_df = pd.merge(test_df, pd_tfidf_test, left_index=True, right_index=True)
train_df['d_lower'] = train_df.description.str.lower()
test_df['d_lower'] = test_df.description.str.lower()
In [16]:
def printchi(df, key):
    imean = df.interest.mean()
    for k in key:
        subset = df[df[k] > 0]
        chi = ((subset.interest.mean() - imean) ** 2.0) * len(subset)
        print(k, len(subset), chi)
descmap = [
[('exposed brick',), 'fd_exposed_brick'],
[('fireplace',), 'fd_fireplace'],
#[('doorman',), 'f_doorman'],
[('microwave',), 'fd_microwave'],
[('laundry in unit', 'washer dryer inside',), 'fd_laundry_in_unit'],
[('dishwasher',), 'fd_dishwasher'],
[('no fee',), 'fd_no_fee'],
[('subway',), 'fd_subway'],
]
fd_features = [m[1] for m in descmap]
def backfill(df):
    for m in descmap:
        df[m[1]] = 0
        for keyword in m[0]:
            nv = df.d_lower.apply(lambda x: keyword in x)
            df[m[1]] |= nv
        df[m[1]] = df[m[1]].astype(np.uint8)
backfill(train_df)
backfill(test_df)
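printchi is defined above but never called in this cell; a natural use (my addition) is to score the new keyword flags the same way the location clusters were scored earlier:

printchi(train_df, fd_features)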
In [17]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
    if train_df[f].dtype == 'object':
        #print(f)
        # fit on train+test together so test-only values still get a consistent label
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))
        base_features.append(f)
In [18]:
# primitive version of bedroom/location price grouper (-submodel-medium does much more work than this!)
# note: despite the name, train_high keeps the 'medium'-interest listings
train_high = train_df[train_df.interest_level == 'medium']
price_group = {}
for g in train_high.groupby(['bedrooms', 'location_cluster']):
    if len(g[1]) < 10:
        continue
    #print(g[0], g[1].price.mean())
    price_group[g[0]] = g[1].price.mean()

def apply_group(df):
    df['price_group'] = MISSING
    for g in df.groupby(['bedrooms', 'location_cluster']):
        if g[0] in price_group:
            df.loc[g[1].index, 'price_group'] = price_group[g[0]]
    df['price_ratio'] = df['price'] / df['price_group']
    return df
train_df = apply_group(train_df)
test_df = apply_group(test_df)
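A quick peek at the resulting columns (my addition, just for inspection; column names come from apply_group above):

print(train_df[['bedrooms', 'location_cluster', 'price', 'price_group', 'price_ratio']].head())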
In [19]:
# non-Bayesian manager features. This tries to pick out 'lazy' managers that don't write enough
# description text, don't assign building IDs properly, etc.
def apply_avg(df, means, key, clip=None):
    df[key] = MISSING  # can't use NaNs with lightgbm yet
    for m in means:
        if clip is not None:
            v = m[1] > clip  # note: `v` is never used, so `clip` is effectively a no-op here
        df.loc[df['manager_id'] == m[0], key] = m[1]

means_dl = []
means_b0 = []
means_f0 = []
median_price = []
for m in train_df[['manager_id', 'description_length', 'building_id', 'num_features', 'price']].groupby('manager_id'):
    if len(m[1]) < 5:
        continue
    means_dl.append((m[0], (m[1].description_length <= 8).mean()))
    means_b0.append((m[0], (m[1].building_id == 0).mean()))
    means_f0.append((m[0], (m[1].num_features == 0).mean()))
    median_price.append((m[0], m[1].price.median()))

for df in [train_df, test_df]:
    apply_avg(df, means_dl, 'manager_shortdesc_rate', 0.25)
    apply_avg(df, means_b0, 'manager_building0_rate', 0.77)
    apply_avg(df, means_f0, 'manager_0feature_rate')
    apply_avg(df, median_price, 'manager_median_price')
    df['manager_lazy_rate'] = np.clip(df.manager_shortdesc_rate + df.manager_building0_rate, 0, 1)
I tried a few different kernel density functions and settled on these two as the most effective.
In [20]:
from sklearn.neighbors import KernelDensity  # sklearn.neighbors.kde was removed in newer sklearn
kde01e = KernelDensity(kernel='exponential', bandwidth=0.01).fit(train_df[['latitude', 'longitude']].values)
train_df['density_exp01'] = kde01e.score_samples(train_df[['latitude', 'longitude']].values)
test_df['density_exp01'] = kde01e.score_samples(test_df[['latitude', 'longitude']].values)
In [21]:
kde02g = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(train_df[['latitude', 'longitude']].values)
train_df['density_gaussian02'] = kde02g.score_samples(train_df[['latitude', 'longitude']].values)
test_df['density_gaussian02'] = kde02g.score_samples(test_df[['latitude', 'longitude']].values)
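score_samples returns log-densities, so the values are on a log scale rather than raw probabilities; a sketch of scoring one hypothetical Midtown coordinate (my numbers, not from the data):

pt = np.array([[40.7549, -73.9840]])  # hypothetical latitude/longitude
print(kde01e.score_samples(pt), kde02g.score_samples(pt))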
In [22]:
train_df.to_pickle('fin-dprep-train.pkl')
test_df.to_pickle('fin-dprep-test.pkl')
In [23]:
full_features = base_features + tfidf_fn + fd_features
In [24]:
with open('fin-dprep-flist.pkl', 'wb') as f:
    pickle.dump(full_features, f)
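Downstream consumers can reload the three artifacts like so (a sketch; filenames come from the cells above):

train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
with open('fin-dprep-flist.pkl', 'rb') as f:
    full_features = pickle.load(f)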