# If I had time, I probably would have rewritten the mathy features to be a lot faster. I didn't. :)

import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import pickle
import time

import sklearn.cluster

import Levenshtein

from multiprocessing import Pool

import lightgbm as lgbm

# Load the prepared train/test frames and the list of feature names.
# NOTE: pickle files must come from a trusted source -- unpickling can run
# arbitrary code.
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
# Use a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open('fin-dprep-flist.pkl', 'rb') as flist_file:
    features_to_use = pickle.load(flist_file)

#adams = pd.read_pickle('features-adams.pkl')

#train_df = pd.merge(train_df, adams, left_on='listing_id', right_index=True)
#test_df = pd.merge(test_df, adams, left_on='listing_id', right_index=True)

# Numeric score per interest level, used as the target of the group-mean
# features.  NOTE(review): the origin of the 'medium' value is not documented
# in this file -- confirm before changing it.
target_num_map_reg = {'low': 0, 'medium': (.5 + (9/13)) / 2, 'high': 1}
train_df['interest'] = np.array(
    train_df['interest_level'].apply(target_num_map_reg.__getitem__)
)

# Attach the externally computed price predictions; rows are matched on
# listing_id against the index of the pickled frame.
medium_price = pd.read_pickle('fin-medium-price.pkl')

train_df = train_df.merge(medium_price, left_on='listing_id', right_index=True)
test_df = test_df.merge(medium_price, left_on='listing_id', right_index=True)

# Compare predicted and actual price on a log scale.
for frame in (train_df, test_df):
    log_predicted = np.log(frame.predicted_price)
    log_actual = np.log(frame.price)
    frame['predicted_price_diff'] = log_predicted - log_actual
    frame['predicted_price_ratio'] = log_predicted / log_actual

# fill in the NaN's.

# Report the share of missing values for every training column that has any
# (the test-set share is intentionally not printed), then replace all missing
# values in both frames with a sentinel.
for column in train_df.columns:
    missing = train_df[column].isnull().sum()
    if missing > 0:
        print(column, missing / len(train_df))

for frame in (train_df, test_df):
    frame.fillna(-99999, inplace=True)

class MeansProcessor:
    """Per-group target statistics (count, mean, std) exposed as features.

    ``fit`` learns, for every distinct value of ``key``, the number of rows
    and the mean / standard deviation of the target column.  ``predict``
    writes those statistics onto another frame; groups that were not seen
    during ``fit`` keep the global statistics (or NaN) and a count of 0.
    """

    def __init__(self, key, outkey = None, tgt = 'interest'):
        """Configure the processor.

        key    -- column name (or list of column names) to group by.
        outkey -- prefix of the generated columns; defaults to ``key``.
        tgt    -- name of the numeric target column read by ``fit``.
        """
        self.key = key
        self.outkey = key if outkey is None else outkey
        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = {}
        self.tgt = tgt
        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']

    def fit(self, df):
        """Learn global and per-group statistics of ``self.tgt`` from ``df``."""
        target = df[self.tgt]
        self.global_means[self.outkey + '_level'] = target.mean()
        self.global_means[self.outkey + '_level_std'] = target.std()

        for group_key, group in df.groupby(self.key, sort=False):
            self.count[group_key] = len(group)
            # The statistics used to sit under an ``if len(...) < 0`` guard,
            # which can never be true, so nothing was ever stored.  A group
            # yielded by groupby always has at least one row, so the values
            # are computed unconditionally.
            self.means[group_key] = np.mean(group[self.tgt])
            self.std[group_key] = np.std(group[self.tgt])

    def predict(self, df, nans = False):
        """Add the ``<outkey>_level``, ``_level_std`` and ``_count`` columns.

        df   -- frame containing the ``key`` column(s); modified in place.
        nans -- when True, unseen groups get NaN instead of the global values.

        Returns ``df`` itself.
        """
        for column in self.outkeys:
            df[column] = np.nan if nans else self.global_means[column]

        count_col = self.outkey + '_count'
        level_col = self.outkey + '_level'
        std_col = self.outkey + '_level_std'
        df[count_col] = 0

        for group_key, group in df.groupby(self.key, sort=False):
            # NOTE(review): the statement under this check was missing from
            # the file; skipping the group (leaving the defaults in place) is
            # the assumed intent -- confirm.
            if group_key == 0:
                continue

            if group_key in self.means:
                rows = group.index
                df.loc[rows, count_col] = self.count[group_key]
                df.loc[rows, level_col] = self.means[group_key]
                df.loc[rows, std_col] = self.std[group_key]
        return df

    def get_features(self):
        """Return the names of all columns that ``predict`` creates."""
        return self.outkeys.copy() + [self.outkey + '_count']

# i kept the same index randomization (with fixed seed) so I could validate this code against
# the original...

# Integer class label per interest level; used to stratify the CV folds.
target_num_map = {'low': 0, 'medium': 1, 'high': 2}
train_y = np.array(train_df['interest_level'].apply(target_num_map.__getitem__))

def proc_fold(fold):
    """Compute the group-mean features for one cross-validation fold.

    fold -- pair ``(train_positions, validation_positions)`` of positional
            row indices into ``train_df``.

    The statistics are learned on the fold's training rows only and then
    written onto the fold's validation rows and onto a copy of ``test_df``.

    Returns ``(cv_train, cv_valid, cv_test)``.
    """
    train_index = fold[0]
    test_index = fold[1]

    cv_train = train_df.iloc[train_index]
    # Explicit copy: predict() adds columns, and we must not write into a
    # slice derived from train_df.
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']].copy()
    cv_test = test_df.copy()

    processors = (
        MeansProcessor('building_id', 'building_sort'),
        MeansProcessor('manager_id', 'manager_sort'),
        MeansProcessor(['building_id', 'manager_id'], 'mb_comb'),
    )
    for processor in processors:
        # Each processor has to be fitted before predict() is called;
        # without this, predict() fails on the empty global statistics.
        processor.fit(cv_train)
        cv_valid = processor.predict(cv_valid)
        cv_test = processor.predict(cv_test)

    return cv_train, cv_valid, cv_test

# Fixed-seed, shuffled 5-fold stratified split over the training row positions.
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [
    (fit_positions, holdout_positions)
    for fit_positions, holdout_positions in kf.split(list(range(train_df.shape[0])), train_y)
]

#with Pool(5) as pool:
#    rv =, folds)

import pickle

# Per-fold group features are expensive to compute, so they are cached on
# disk: load the cache when it is readable, otherwise compute the folds in
# parallel and write the cache.
# NOTE(review): this block was reconstructed from a damaged fragment
# (the try/except skeleton and the pool.map call were missing) -- confirm.
try:
    with open('bag-model-groupfeatures_nonan.pkl', 'rb') as cache_file:
        rv = pickle.load(cache_file)
except (OSError, EOFError, pickle.UnpicklingError):
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)

    with open('bag-model-groupfeatures_nonan.pkl', 'wb') as cache_file:
        pickle.dump(rv, cache_file)

# Unfitted processors, created only to ask them which columns they produce.
m_build = MeansProcessor('building_id', outkey='building_sort')
m_mgr = MeansProcessor('manager_id', outkey='manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], outkey='mb_comb')

group_features = [
    feature
    for processor in (m_build, m_mgr, m_comb)
    for feature in processor.get_features()
]

#cv_test = [r[2] for r in rv]
# One test frame per fold: the current test_df plus that fold's group features,
# joined on the row index.
cv_test = [
    test_df.merge(fold_result[2][group_features], left_index=True, right_index=True)
    for fold_result in rv
]

# Stack the validation parts of all folds and join their group features onto
# the training frame by row index.
cv_allvalid = pd.concat([fold_result[1] for fold_result in rv])

train_df = pd.merge(train_df, cv_allvalid[group_features], left_index=True, right_index=True)

# New LightGBM tests

# Same fixed-seed 5-fold stratified split as above, this time stratified on
# train_df.interest_cat.
# NOTE(review): interest_cat is not created in the visible part of this file;
# presumably it already exists in the loaded frame -- confirm.
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_df.interest_cat)]

# prep CV

cv_train = []
cv_valid = []

# NOTE(review): the body of this loop is missing from the file, so cv_train and
# cv_valid are never filled here and the file does not parse as-is -- restore
# the body from the original source.
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):

# features from

import math
def cart2rho(x, y):
    """Return the distance of the point (x, y) from the origin."""
    return np.sqrt(x**2 + y**2)

def cart2phi(x, y):
    """Return the polar angle (radians) of the point (x, y), via arctan2(y, x)."""
    return np.arctan2(y, x)

def rotation_x(row, alpha):
    """Return the first coordinate of (latitude, longitude) rotated by ``alpha`` radians.

    ``row`` is anything indexable by the keys 'latitude' and 'longitude'.
    """
    lat, lon = row['latitude'], row['longitude']
    return lat * math.cos(alpha) + lon * math.sin(alpha)

def rotation_y(row, alpha):
    """Return the second coordinate of (latitude, longitude) rotated by ``alpha`` radians.

    ``row`` is anything indexable by the keys 'latitude' and 'longitude'.
    """
    lat, lon = row['latitude'], row['longitude']
    return lon * math.cos(alpha) - lat * math.sin(alpha)

def add_rotation(degrees, df):
    """Add the coordinates rotated by ``degrees`` as two new columns of ``df``.

    degrees -- rotation angle in degrees.
    df      -- frame with 'latitude' and 'longitude' columns; modified in place.

    Creates ``num_rot<degrees>_X`` and ``num_rot<degrees>_Y`` using the same
    formulas as ``rotation_x`` / ``rotation_y``, applied to whole columns
    instead of row by row.

    Returns ``df`` itself.
    """
    # Same angle expression as before; 0 degrees is special-cased because
    # 180 / 0 would raise ZeroDivisionError.
    alpha = math.pi / (180 / degrees) if degrees else 0.0
    cos_alpha = math.cos(alpha)
    sin_alpha = math.sin(alpha)

    latitude = df['latitude']
    longitude = df['longitude']

    namex = "rot" + str(degrees) + "_X"
    namey = "rot" + str(degrees) + "_Y"

    df['num_' + namex] = latitude * cos_alpha + longitude * sin_alpha
    df['num_' + namey] = longitude * cos_alpha - latitude * sin_alpha

    return df

def operate_on_coordinates(tr_df, te_df):
    """Add polar and rotated coordinate features to both frames.

    For each frame this creates ``num_rho`` / ``num_phi`` (polar coordinates
    of latitude/longitude relative to the fixed point 40.7518, -73.9779) and
    the rotated coordinates for 15, 30, 45 and 60 degrees.  Both frames are
    modified in place and returned as ``(tr_df, te_df)``.
    """
    for frame in (tr_df, te_df):
        # Polar coordinates, computed row by row.
        frame["num_rho"] = frame.apply(
            lambda rec: cart2rho(rec["latitude"] - 40.7518, rec["longitude"]+73.9779),
            axis=1,
        )
        frame["num_phi"] = frame.apply(
            lambda rec: cart2phi(rec["latitude"] - 40.7518, rec["longitude"]+73.9779),
            axis=1,
        )
        # Rotated coordinates.
        for angle in (15, 30, 45, 60):
            frame = add_rotation(angle, frame)

    return tr_df, te_df

# Add the polar and rotated coordinate columns to both frames (the function
# modifies them in place and returns them).
train_df, test_df = operate_on_coordinates(train_df, test_df)

import re

def cap_share(x):
    """Return the number of upper-case characters in ``x`` divided by ``len(x) + 1``.

    The ``+ 1`` keeps the empty string from dividing by zero.
    """
    uppercase_count = len([ch for ch in x if ch.isupper()])
    return uppercase_count / float(len(x) + 1)

# Pattern for a phone-number-like run anywhere in the text: an optional '(',
# then groups of 3, 3 and 4 digits, each separated by at most three
# non-digit characters.  Raw string so the backslash escapes are valid;
# compiled once instead of on every loop pass.
reg = re.compile(r".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", re.S)


def try_and_find_nr(description):
    """Return 1 when ``description`` contains a phone-number-like run, else 0."""
    if reg.match(description) is None:
        return 0
    return 1


for df in [train_df, test_df]:
    descriptions = df['description']

    # do you think that users might feel annoyed BY A DESCRIPTION THAT IS SHOUTING AT THEM?
    df['num_cap_share'] = descriptions.apply(cap_share)
    # how long in lines the desc is?
    df['num_nr_of_lines'] = descriptions.apply(lambda x: x.count('<br /><br />'))

    # The 0/1 flags are built directly from the boolean masks.  The previous
    # code used ``.ix`` (removed in pandas 1.0) with chained assignment.

    # is the description redacted by the website?
    df['num_redacted'] = descriptions.str.contains('website_redacted', regex=False).astype('int64')

    # can we contact someone via e-mail to ask for the details?
    df['num_email'] = descriptions.str.contains('@', regex=False).astype('int64')

    # and... can we call them?
    df['num_phone_nr'] = descriptions.apply(try_and_find_nr)

# Names of the rotated-coordinate columns added above.
# NOTE(review): the body of the original ``if 'rot' in f:`` was missing from
# the file; collecting the matching column names is the assumed intent --
# confirm against the original source.
newfeatures = [f for f in train_df.keys() if 'rot' in f]


#for f in range(5):
#    cv_test[f][newfeatures] = test_df[newfeatures].copy()

# Collect the new feature columns of both frames into one table indexed by
# listing_id.
export_columns = newfeatures + ['listing_id']
nf_train = train_df[export_columns].copy()
nf_test = test_df[export_columns].copy()
adams = pd.concat([nf_train, nf_test]).set_index('listing_id')

