Renthop Notebook

Two Sigma Connect Renthop Competition


Info

Author: Theo Naunheim

Date: 2017-04-22

License: MIT license

Link: https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries


"Strategy"

  • Create feature probabilities based on 'features' text and 'description' text.

  • Create feature probabilities based on longitude and latitude DBSCAN clusters.

  • Create features based on perceived value.

  • Throw all the features at a boosted gradient classfier and let god sort them out.


Inputs

  1. train.json (49,352 samples, 15 features)
  2. test.json (74,659 samples, 14 features)

Outputs

  1. renthop_predictions.csv (74,659 samples, 4 features)


In [1]:
import numpy as np
import pandas as pd

from sklearn.cluster import DBSCAN

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KDTree

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder

In [2]:
# Original matrix
odf = pd.read_json('./train.json').set_index('listing_id')
oy = odf['interest_level']
oX = odf.drop('interest_level', axis=1)

# Test matrix
tX = pd.read_json('./test.json').set_index('listing_id')

# Original derived matrix
odX = pd.DataFrame(index=oX.index)

# Test-derived matrix
tdX = pd.DataFrame(index=tX.index)

# All matrices.
all_X = [oy, oX, odX, tX, tdX]

In [3]:
def create_features_probabilities(oy, oX, odX, tX, tdX):
    '''Create probabilities for 'feature' column.'''

    # Make text vectors
    o_feat = oX['features'].map(lambda text_list: ' '.join(text_list))
    t_feat = tX['features'].map(lambda text_list: ' '.join(text_list))

    # Gridsearch Params (TODO, add more)
    parameters = {'vec__ngram_range': [(1, 1)],
                  'vec__min_df': [10, 15],
                  'trans__use_idf': [True]}

    # Create pipeline
    pipeline = Pipeline([('vec', CountVectorizer()),
                         ('trans', TfidfTransformer()),
                         ('clf', ExtraTreesClassifier())])

    # Gridsearch
    feat_clf = GridSearchCV(pipeline, parameters, n_jobs=3)

    # Fit
    feat_clf.fit(o_feat.values, oy.values)

    # List classes and create friendly labels
    feat_classes = feat_clf.best_estimator_.classes_
    feat_headers = np.add(['text_feat_'], [feat_classes]).flatten()


    # Get proba and add to original derived dataframe
    tdf1 = pd.DataFrame.from_records(feat_clf.predict_log_proba(o_feat),
                                     columns=feat_headers,
                                     index=oX.index)
    odX = pd.concat([odX, tdf1], axis=1)
    
    # Get proba and add to test derived dataframe.
    tdf2 = pd.DataFrame.from_records(feat_clf.predict_log_proba(t_feat),
                                     columns=feat_headers,
                                     index=tX.index)
    tdX = pd.concat([tdX, tdf2], axis=1)
    
    return [oy, oX, odX, tX, tdX]


all_X = create_features_probabilities(*all_X)


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:628: RuntimeWarning: divide by zero encountered in log
  return np.log(proba)

In [4]:
all_X[4].head()


Out[4]:
text_feat_high text_feat_low text_feat_medium
listing_id
7142618 -inf 0.000000 -inf
7210040 -3.954089 -0.096874 -2.615197
7103890 -inf -0.310155 -1.321756
7143442 -inf -0.182322 -1.791759
6860601 -inf 0.000000 -inf

In [5]:
def create_description_probabilities(oy, oX, odX, tX, tdX):
    '''Create probabilities for 'feature' column.'''

    # Make text vectors
    o_desc = oX['description']
    t_desc = tX['description']

    # Gridsearch Params
    parameters = {'vec__ngram_range': [(1, 3)],
                  'vec__min_df': [10],
                  'trans__use_idf': [True]}

    # Create pipeline
    pipeline = Pipeline([('vec', CountVectorizer()),
                         ('trans', TfidfTransformer()),
                         ('clf', ExtraTreesClassifier())])

    # Gridsearch
    desc_clf = GridSearchCV(pipeline, parameters, n_jobs=3)

    # Fit
    desc_clf.fit(o_desc.values, oy.values)

    # List classes and create friendly labels
    desc_classes = desc_clf.best_estimator_.classes_
    desc_headers = np.add(['text_desc_'], [desc_classes]).flatten()


    # Get proba and add to original derived dataframe -> temp df 1
    tdf1 = pd.DataFrame.from_records(desc_clf.predict_log_proba(o_desc),
                                     columns=desc_headers,
                                     index=oX.index)
    odX = pd.concat([odX, tdf1], axis=1)
    
    # Get proba and add to test derived dataframe -> temp df 2
    tdf2 = pd.DataFrame.from_records(desc_clf.predict_log_proba(t_desc),
                                     columns=desc_headers,
                                     index=tX.index)
    tdX = pd.concat([tdX, tdf2], axis=1)
    
    return [oy, oX, odX, tX, tdX]


all_X = create_description_probabilities(*all_X)


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:628: RuntimeWarning: divide by zero encountered in log
  return np.log(proba)

In [6]:
all_X[4].head()


Out[6]:
text_feat_high text_feat_low text_feat_medium text_desc_high text_desc_low text_desc_medium
listing_id
7142618 -inf 0.000000 -inf -inf -0.356675 -1.203973
7210040 -3.954089 -0.096874 -2.615197 -1.203973 -0.510826 -2.302585
7103890 -inf -0.310155 -1.321756 -inf -2.302585 -0.105361
7143442 -inf -0.182322 -1.791759 -1.696449 -0.836248 -0.958850
6860601 -inf 0.000000 -inf -inf 0.000000 -inf

In [7]:
def create_dbscan_df(oy, oX, odX, tX, tdX):
    '''Create DBSCAN lookup database with probabilities.'''
    # Get locations
    locations = oX[['longitude', 'latitude']]

    # First cluster using DBSCAN
    cl_clf = DBSCAN(eps=.001, min_samples=10, n_jobs=3)
    cluster_designations = cl_clf.fit_predict(locations.values)
    
    # Create an dataframe with temp info
    tdf = pd.DataFrame({'interest_level': oy,
                        'cluster': cluster_designations,
                        'longitude': oX['longitude'],
                        'latitude': oX['latitude']})
    
    # Calculate cluster centers
    centers = tdf.groupby('cluster')[['longitude', 'latitude']].mean()

    # Break out clusters by interest level
    counts = tdf.groupby(['cluster', 'interest_level']).size()
    count_df = counts.unstack().fillna(0)
    proba_df = count_df.apply(lambda x: np.log(x/sum(x)), axis=1)

    # Concat into df
    dbscan_df = pd.concat([proba_df, centers], axis=1)
    dbscan_df = dbscan_df.add_prefix('dbscan_avg_')
    
    # Create kd tree for faster indexing
    kd_tree = KDTree(centers)
    
    return (dbscan_df, kd_tree)


dbscan_df, kd_tree = create_dbscan_df(*all_X)


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log

In [8]:
all_X[4].head()


Out[8]:
text_feat_high text_feat_low text_feat_medium text_desc_high text_desc_low text_desc_medium
listing_id
7142618 -inf 0.000000 -inf -inf -0.356675 -1.203973
7210040 -3.954089 -0.096874 -2.615197 -1.203973 -0.510826 -2.302585
7103890 -inf -0.310155 -1.321756 -inf -2.302585 -0.105361
7143442 -inf -0.182322 -1.791759 -1.696449 -0.836248 -0.958850
6860601 -inf 0.000000 -inf -inf 0.000000 -inf

In [9]:
def create_dbscan_probabilities(oy, oX, odX, tX, tdX, dbscan_df, kd_tree):
    '''Create probabilities for both derived dataframes.'''
    
    # Get nearest cluster from indices.
    train_indices = kd_tree.query(oX[['longitude', 'latitude']],
                                  return_distance=False,
                                  dualtree=True).flatten()

    # Merge indices with index
    train_cluster_df = pd.DataFrame({'cluster': train_indices}, index=oX.index)

    # Join df to eliminate cluster and add probs. 
    train_dbscan_probs = train_cluster_df.join(dbscan_df,
                                               how='left',
                                               on='cluster')[['dbscan_avg_high',
                                                              'dbscan_avg_low',
                                                              'dbscan_avg_medium']]

    # Append probabilities to original derived database for train
    odX = pd.concat([odX, train_dbscan_probs], axis=1)
    
    # Get the index for the closest cluster center for test.
    test_indices = kd_tree.query(tX[['longitude', 'latitude']],
                                 return_distance=False,
                                 dualtree=True).flatten()
    
    # Merge indices with index
    test_cluster_df = pd.DataFrame({'cluster': test_indices}, index=tX.index)

    # Join df to eliminate cluster and add probs. 
    test_dbscan_probs = test_cluster_df.join(dbscan_df,
                                             how='left',
                                             on='cluster')[['dbscan_avg_high',
                                                            'dbscan_avg_low',
                                                            'dbscan_avg_medium']]

    # Append probabilities to original derived database
    tdX = pd.concat([tdX, test_dbscan_probs], axis=1)

    return [oy, oX, odX, tX, tdX]


all_X = create_dbscan_probabilities(*all_X, dbscan_df, kd_tree)

In [10]:
all_X[4].head()


Out[10]:
text_feat_high text_feat_low text_feat_medium text_desc_high text_desc_low text_desc_medium dbscan_avg_high dbscan_avg_low dbscan_avg_medium
listing_id
7142618 -inf 0.000000 -inf -inf -0.356675 -1.203973 -2.564949 -0.282567 -1.776492
7210040 -3.954089 -0.096874 -2.615197 -1.203973 -0.510826 -2.302585 -2.602690 -0.730888 -0.810930
7103890 -inf -0.310155 -1.321756 -inf -2.302585 -0.105361 -inf -0.143101 -2.014903
7143442 -inf -0.182322 -1.791759 -1.696449 -0.836248 -0.958850 -inf -0.271934 -1.435085
6860601 -inf 0.000000 -inf -inf 0.000000 -inf -inf -0.318454 -1.299283

In [11]:
def add_room_pricing_features(oy, oX, odX, tX, tdX):
    '''Add value-ish features to dataframe.'''
    # Create features
    train_pricing_df = pd.DataFrame({'dollars_per_bedroom': oX['bedrooms']/oX['price'],
                                     'dollars_per_bathroom': oX['bathrooms']/ oX['price'],
                                     'bathroom_percentage': oX['bathrooms'] / (oX['bedrooms'] + oX['bathrooms']),
                                     'price': oX['price'],
                                     'bathrooms': oX['bathrooms'],
                                     'bedrooms': oX['bedrooms']},
                                     index=oX.index)
    # Percentiles 
    train_pricing_df = (train_pricing_df.rank(axis=1, pct=True, method='dense')
                                        .applymap(lambda x: np.log(x)))

    # Append to original original derived dataframe
    odX = pd.concat([odX, train_pricing_df], axis=1)
    
    # Create features
    test_pricing_df = pd.DataFrame({'dollars_per_bedroom': tX['bedrooms']/ tX['price'],
                                    'dollars_per_bathroom': tX['bathrooms']/ tX['price'],
                                    'bathroom_percentage': tX['bathrooms'] / (tX['bedrooms'] + tX['bathrooms']),
                                    'price': tX['price'],
                                    'bathrooms': tX['bathrooms'],
                                    'bedrooms': tX['bedrooms']},
                                    index=tX.index)
    # Percentiles
    test_pricing_df = (test_pricing_df.rank(axis=1, pct=True, method='dense')
                                      .applymap(lambda x: np.log(x)))

    # Append to original original derived dataframe
    tdX = pd.concat([tdX, test_pricing_df], axis=1)
    
    return [oy, oX, odX, tX, tdX]


all_X = add_room_pricing_features(*all_X)

In [12]:
all_X[4].head()
cp_odX = all_X[2].copy()
cp_tdX = all_X[4].copy()

In [17]:
def create_boosted_clf(oy, oX, odX, tX, tdX):
    '''Create gbc, train gbc, cross validate, and return gbc.'''
    # Cast negative infinity to something model can work with
    odX = odX.replace(-np.inf, np.float32(9.9e-25))
    tdX = tdX.replace(-np.inf, np.float32(9.9e-25))
    # Fill na values with whatever.
    odX = odX.fillna(odX.mean())
    tdX = tdX.fillna(tdX.mean())
    le = LabelEncoder().fit(oy)
    # Grid search
    params = {#'loss': ['deviance', 'exponential'],
              # 'learning_rate': [1, 0.1, .01],
              # 'n_estimators': [100, 250],
              'subsample': [1.0, .5], 
              #'min_samples_split': [2, 5],
              #'min_samples_leaf': [1, 2, 5],
              'min_weight_fraction_leaf': [0.0, .2],
              #'max_depth': [1, 3, 5],
              'min_impurity_split': [1e-05, 1e-09]}
    gbc = GridSearchCV(GradientBoostingClassifier(),
                       params,
                       scoring='neg_log_loss',
                       n_jobs=3)
    gbc.fit(odX, le.transform(oy))
    # Score
    print(cross_val_score(gbc,
                          odX,
                          le.transform(oy),
                          scoring='accuracy'))
    
    return ([oy, oX, odX, tX, tdX], gbc, le)

    
all_X, gbc, le = create_boosted_clf(*all_X)


[ 0.72633882  0.72500152  0.72541033]

In [14]:
gbc


Out[14]:
GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'subsample': [1.0, 0.5], 'min_weight_fraction_leaf': [0.0, 0.2], 'min_impurity_split': [1e-05, 1e-09]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [16]:
def output_predictions(oy, oX, odX, tX, tdX, gbc, le):
    '''Create the actual predictions and write to csv.'''
    # Run predictions and write to csv
    predictions = pd.DataFrame(gbc.predict_proba(tdX.values),
                               columns=le.classes_,
                               index=tdX.index)
    predictions = predictions[['high', 'medium', 'low']]
    predictions.to_csv('renthop_predictions.csv',
                       index_label='listing_id')
    return predictions


predictions = output_predictions(*all_X, gbc, le)
predictions.head(5)


Out[16]:
high medium low
listing_id
7142618 0.010191 0.178271 0.811537
7210040 0.127037 0.103861 0.769102
7103890 0.002051 0.487031 0.510918
7143442 0.008680 0.445127 0.546193
6860601 0.035208 0.137436 0.827356