If I'd had time, I probably would have rewritten the mathy features to be a lot faster. I didn't. :)
In [ ]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
import time
import sklearn.cluster
import Levenshtein
from multiprocessing import Pool
import lightgbm as lgbm
In [ ]:
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
features_to_use = pickle.load(open('fin-dprep-flist.pkl', 'rb'))
#adams = pd.read_pickle('features-adams.pkl')
#train_df = pd.merge(train_df, adams, left_on='listing_id', right_index=True)
#test_df = pd.merge(test_df, adams, left_on='listing_id', right_index=True)
In [ ]:
# map interest levels onto a regression target; 'medium' lands between the
# endpoints at (.5 + 9/13) / 2, roughly 0.596
target_num_map_reg = {'low': 0, 'medium': (.5 + (9/13)) / 2, 'high': 1}
train_df['interest'] = np.array(train_df['interest_level'].apply(lambda x: target_num_map_reg[x]))
In [ ]:
medium_price = pd.read_pickle('fin-medium-price.pkl')
train_df = pd.merge(train_df, medium_price, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, medium_price, left_on='listing_id', right_index=True)
In [ ]:
for df in [train_df, test_df]:
    # log difference and log ratio of the modeled price vs. the listed price
    df['predicted_price_diff'] = np.log(df.predicted_price) - np.log(df.price)
    df['predicted_price_ratio'] = np.log(df.predicted_price) / np.log(df.price)
In [ ]:
# report, then fill in, the NaNs
for t in train_df.keys():
    nacount = train_df[t].isnull().sum()
    if nacount:
        # nacount_test = test_df[t].isnull().sum()
        print(t, nacount / len(train_df))#, nacount_test / len(test_df))

train_df.fillna(-99999, inplace=True)
test_df.fillna(-99999, inplace=True)
In [ ]:
# out-of-fold target encoding: per-group mean/std of the target, with the
# global statistics as the fallback for unseen groups
class MeansProcessor:
    def __init__(self, key, outkey=None, tgt='interest'):
        self.key = key
        self.outkey = key if outkey is None else outkey

        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = {}

        self.tgt = tgt
        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']

    def fit(self, df):
        self.global_means[self.outkey + '_level'] = df[self.tgt].mean()
        self.global_means[self.outkey + '_level_std'] = df[self.tgt].std()

        for k in df.groupby(self.key, sort=False):
            # groupby never yields empty groups, so mean/std are always defined
            self.count[k[0]] = len(k[1])
            self.means[k[0]] = np.mean(k[1][self.tgt])
            self.std[k[0]] = np.std(k[1][self.tgt])

    def predict(self, df, nans=False):
        for l in self.outkeys:
            df[l] = np.nan if nans else self.global_means[l]

        df[self.outkey + '_count'] = 0

        for k in df.groupby(self.key, sort=False):
            # id 0 is treated as missing; leave the global fallback in place
            if k[0] == 0:
                continue
            if k[0] in self.means:
                df.loc[k[1].index, self.outkey + '_count'] = self.count[k[0]]
                df.loc[k[1].index, self.outkey + '_level'] = self.means[k[0]]
                df.loc[k[1].index, self.outkey + '_level_std'] = self.std[k[0]]

        return df

    def get_features(self):
        return self.outkeys.copy() + [self.outkey + '_count']
# I kept the same index randomization (with fixed seed) so I could validate
# this code against the original...
target_num_map = {'low': 0, 'medium': 1, 'high': 2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

def proc_fold(fold):
    train_index = fold[0]
    test_index = fold[1]

    cv_train = train_df.iloc[train_index]
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
    cv_test = test_df.copy()

    # fit the group-target encoders on the training part of the fold only,
    # then apply them to the held-out part and to the test set
    m_build = MeansProcessor('building_id', 'building_sort')
    m_build.fit(cv_train)
    cv_valid = m_build.predict(cv_valid)
    cv_test = m_build.predict(cv_test)

    m_mgr = MeansProcessor('manager_id', 'manager_sort')
    m_mgr.fit(cv_train)
    cv_valid = m_mgr.predict(cv_valid)
    cv_test = m_mgr.predict(cv_test)

    m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
    m_comb.fit(cv_train)
    cv_valid = m_comb.predict(cv_valid)
    cv_test = m_comb.predict(cv_test)

    return cv_train, cv_valid, cv_test

kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]

#with Pool(5) as pool:
#    rv = pool.map(proc_fold, folds)

# cache the per-fold outputs; recompute in parallel only if the pickle is absent
try:
    rv = pickle.load(open('bag-model-groupfeatures_nonan.pkl', 'rb'))
except (FileNotFoundError, EOFError):
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)

    pickle.dump(rv, open('bag-model-groupfeatures_nonan.pkl', 'wb'))
# dummies to get feature ids
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')

group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()

#cv_test = [r[2] for r in rv]
# one copy of the test set per fold, each carrying that fold's group features
cv_test = []
for r in rv:
    cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))

# the out-of-fold predictions cover every training row exactly once
cv_allvalid = pd.concat([r[1] for r in rv])
train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)
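A minimal sketch of what MeansProcessor produces, on made-up data (the toy frames and ids below are hypothetical, purely to illustrate fit/predict):
In [ ]:
# 'a' was seen in training; 'c' was not, so it keeps the global fallback values
toy_train = pd.DataFrame({'manager_id': ['a', 'a', 'b'], 'interest': [0.0, 1.0, 1.0]})
toy_valid = pd.DataFrame({'manager_id': ['a', 'c']})

mp = MeansProcessor('manager_id', 'mgr_demo')
mp.fit(toy_train)
mp.predict(toy_valid)[mp.get_features()]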
New LightGBM tests
In [ ]:
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_df.interest_cat)]
In [ ]:
# prep CV
cv_train = []
cv_valid = []

for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    # kf.split yields positional indices, so use iloc rather than loc
    cv_train.append(train_df.iloc[tr_index])
    cv_valid.append(train_df.iloc[val_index])
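The header above promises LightGBM tests; as a minimal sketch of how these folds could feed lgbm (the parameters are illustrative assumptions, not a tuned setup, and interest_cat is assumed to hold the 0/1/2 class codes):
In [ ]:
def lgbm_cv_sketch(feats):
    # one multiclass model per fold over a given feature list (sketch only)
    params = {'objective': 'multiclass', 'num_class': 3,
              'learning_rate': 0.1, 'verbose': -1}
    scores = []
    for tr, val in zip(cv_train, cv_valid):
        dtrain = lgbm.Dataset(tr[feats], label=tr.interest_cat)
        model = lgbm.train(params, dtrain, num_boost_round=200)
        scores.append(log_loss(val.interest_cat, model.predict(val[feats])))
    return np.mean(scores)

# e.g. lgbm_cv_sketch(features_to_use + group_features)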
In [ ]:
# features from https://www.kaggle.com/adamsfei/two-sigma-connect-rental-listing-inquiries/only-brand-new-features/notebook
import math

def cart2rho(x, y):
    # radial distance from the origin
    rho = np.sqrt(x**2 + y**2)
    return rho

def cart2phi(x, y):
    # angle from the origin
    phi = np.arctan2(y, x)
    return phi

def rotation_x(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return x*math.cos(alpha) + y*math.sin(alpha)

def rotation_y(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return y*math.cos(alpha) - x*math.sin(alpha)

def add_rotation(degrees, df):
    namex = "rot" + str(degrees) + "_X"
    namey = "rot" + str(degrees) + "_Y"

    df['num_' + namex] = df.apply(lambda row: rotation_x(row, math.pi/(180/degrees)), axis=1)
    df['num_' + namey] = df.apply(lambda row: rotation_y(row, math.pi/(180/degrees)), axis=1)

    return df

def operate_on_coordinates(tr_df, te_df):
    for df in [tr_df, te_df]:
        # polar coordinate system centered on (40.7518, -73.9779), roughly midtown Manhattan
        df["num_rho"] = df.apply(lambda x: cart2rho(x["latitude"] - 40.7518, x["longitude"] + 73.9779), axis=1)
        df["num_phi"] = df.apply(lambda x: cart2phi(x["latitude"] - 40.7518, x["longitude"] + 73.9779), axis=1)
        # rotations
        for angle in [15, 30, 45, 60]:
            df = add_rotation(angle, df)

    return tr_df, te_df

train_df, test_df = operate_on_coordinates(train_df, test_df)
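These per-row apply calls are the slow "mathy features" mentioned at the top; a vectorized equivalent (a sketch only, not validated against the original outputs) would be:
In [ ]:
def add_rotation_vectorized(degrees, df):
    # same rotation as rotation_x/rotation_y, computed on whole columns at once
    alpha = math.pi / (180 / degrees)
    x, y = df['latitude'], df['longitude']
    df['num_rot' + str(degrees) + '_X'] = x * math.cos(alpha) + y * math.sin(alpha)
    df['num_rot' + str(degrees) + '_Y'] = y * math.cos(alpha) - x * math.sin(alpha)
    return df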
In [ ]:
import re

def cap_share(x):
    # share of upper-case characters in the description
    return sum(1 for c in x if c.isupper()) / float(len(x) + 1)

for df in [train_df, test_df]:
    # do you think that users might feel annoyed BY A DESCRIPTION THAT IS SHOUTING AT THEM?
    df['num_cap_share'] = df['description'].apply(cap_share)

    # how long, in lines, is the description?
    df['num_nr_of_lines'] = df['description'].apply(lambda x: x.count('<br /><br />'))

    # is the description redacted by the website?
    df['num_redacted'] = 0
    df.loc[df['description'].str.contains('website_redacted'), 'num_redacted'] = 1

    # can we contact someone via e-mail to ask for the details?
    df['num_email'] = 0
    df.loc[df['description'].str.contains('@'), 'num_email'] = 1

    # ...and can we call them?
    reg = re.compile(r".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", re.S)

    def try_and_find_nr(description):
        if reg.match(description) is None:
            return 0
        return 1

    df['num_phone_nr'] = df['description'].apply(try_and_find_nr)
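A quick sanity check of the phone-number regex on made-up strings (purely illustrative):
In [ ]:
for s in ['call 212-555-0123 today', 'call (212) 555 0123', 'no number here']:
    print(s, '->', try_and_find_nr(s))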
In [ ]:
newfeatures = []
for f in train_df.keys():
    #print(f)
    if 'rot' in f:
        newfeatures.append(f)

newfeatures.append('num_rho')
newfeatures.append('num_phi')
newfeatures.append('num_cap_share')
newfeatures.append('num_nr_of_lines')
newfeatures.append('num_redacted')
newfeatures.append('num_email')
newfeatures.append('num_phone_nr')

#for f in range(5):
#    cv_test[f][newfeatures] = test_df[newfeatures].copy()
In [ ]:
# stack train and test, indexed by listing_id, so these features can be merged
# back in on later runs (see the commented-out merge near the top)
nf_train = train_df[newfeatures + ['listing_id']].copy()
nf_test = test_df[newfeatures + ['listing_id']].copy()

adams = pd.concat([nf_train, nf_test])
adams.set_index('listing_id', inplace=True)
In [ ]:
adams.to_pickle('features-adams.pkl')