Renthop Notebook

Two Sigma Connect Renthop Competition

Info

Author: Theo Naunheim

Date: 2017-04-22

License: MIT license

Link: https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries

"Strategy"

Create feature probabilities based on 'features' text and 'description' text.
Create feature probabilities based on longitude and latitude DBSCAN clusters.
Create features based on perceived value.
Throw all the features at a boosted gradient classfier and let god sort them out.

Inputs

train.json (49,352 samples, 15 features)
test.json (74,659 samples, 14 features)

Outputs

renthop_predictions.csv (74,659 samples, 4 features)



In [1]:

    
import numpy as np
import pandas as pd

from sklearn.cluster import DBSCAN

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KDTree

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder



In [2]:

    
# Original matrix
odf = pd.read_json('./train.json').set_index('listing_id')
oy = odf['interest_level']
oX = odf.drop('interest_level', axis=1)

# Test matrix
tX = pd.read_json('./test.json').set_index('listing_id')

# Original derived matrix
odX = pd.DataFrame(index=oX.index)

# Test-derived matrix
tdX = pd.DataFrame(index=tX.index)

# All matrices.
all_X = [oy, oX, odX, tX, tdX]



In [3]:

    
def create_features_probabilities(oy, oX, odX, tX, tdX):
    '''Create probabilities for 'feature' column.'''

    # Make text vectors
    o_feat = oX['features'].map(lambda text_list: ' '.join(text_list))
    t_feat = tX['features'].map(lambda text_list: ' '.join(text_list))

    # Gridsearch Params (TODO, add more)
    parameters = {'vec__ngram_range': [(1, 1)],
                  'vec__min_df': [10, 15],
                  'trans__use_idf': [True]}

    # Create pipeline
    pipeline = Pipeline([('vec', CountVectorizer()),
                         ('trans', TfidfTransformer()),
                         ('clf', ExtraTreesClassifier())])

    # Gridsearch
    feat_clf = GridSearchCV(pipeline, parameters, n_jobs=3)

    # Fit
    feat_clf.fit(o_feat.values, oy.values)

    # List classes and create friendly labels
    feat_classes = feat_clf.best_estimator_.classes_
    feat_headers = np.add(['text_feat_'], [feat_classes]).flatten()


    # Get proba and add to original derived dataframe
    tdf1 = pd.DataFrame.from_records(feat_clf.predict_log_proba(o_feat),
                                     columns=feat_headers,
                                     index=oX.index)
    odX = pd.concat([odX, tdf1], axis=1)
    
    # Get proba and add to test derived dataframe.
    tdf2 = pd.DataFrame.from_records(feat_clf.predict_log_proba(t_feat),
                                     columns=feat_headers,
                                     index=tX.index)
    tdX = pd.concat([tdX, tdf2], axis=1)
    
    return [oy, oX, odX, tX, tdX]


all_X = create_features_probabilities(*all_X)









    



C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:628: RuntimeWarning: divide by zero encountered in log
  return np.log(proba)



In [4]:

    
all_X[4].head()









    Out[4]:






  
    
      
      text_feat_high
      text_feat_low
      text_feat_medium
    
    
      listing_id
      
      
      
    
  
  
    
      7142618
      -inf
      0.000000
      -inf
    
    
      7210040
      -3.954089
      -0.096874
      -2.615197
    
    
      7103890
      -inf
      -0.310155
      -1.321756
    
    
      7143442
      -inf
      -0.182322
      -1.791759
    
    
      6860601
      -inf
      0.000000
      -inf



In [5]:

    
def create_description_probabilities(oy, oX, odX, tX, tdX):
    '''Create probabilities for 'feature' column.'''

    # Make text vectors
    o_desc = oX['description']
    t_desc = tX['description']

    # Gridsearch Params
    parameters = {'vec__ngram_range': [(1, 3)],
                  'vec__min_df': [10],
                  'trans__use_idf': [True]}

    # Create pipeline
    pipeline = Pipeline([('vec', CountVectorizer()),
                         ('trans', TfidfTransformer()),
                         ('clf', ExtraTreesClassifier())])

    # Gridsearch
    desc_clf = GridSearchCV(pipeline, parameters, n_jobs=3)

    # Fit
    desc_clf.fit(o_desc.values, oy.values)

    # List classes and create friendly labels
    desc_classes = desc_clf.best_estimator_.classes_
    desc_headers = np.add(['text_desc_'], [desc_classes]).flatten()


    # Get proba and add to original derived dataframe -> temp df 1
    tdf1 = pd.DataFrame.from_records(desc_clf.predict_log_proba(o_desc),
                                     columns=desc_headers,
                                     index=oX.index)
    odX = pd.concat([odX, tdf1], axis=1)
    
    # Get proba and add to test derived dataframe -> temp df 2
    tdf2 = pd.DataFrame.from_records(desc_clf.predict_log_proba(t_desc),
                                     columns=desc_headers,
                                     index=tX.index)
    tdX = pd.concat([tdX, tdf2], axis=1)
    
    return [oy, oX, odX, tX, tdX]


all_X = create_description_probabilities(*all_X)









    



C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:628: RuntimeWarning: divide by zero encountered in log
  return np.log(proba)



In [6]:

    
all_X[4].head()









    Out[6]:






  
    
      
      text_feat_high
      text_feat_low
      text_feat_medium
      text_desc_high
      text_desc_low
      text_desc_medium
    
    
      listing_id
      
      
      
      
      
      
    
  
  
    
      7142618
      -inf
      0.000000
      -inf
      -inf
      -0.356675
      -1.203973
    
    
      7210040
      -3.954089
      -0.096874
      -2.615197
      -1.203973
      -0.510826
      -2.302585
    
    
      7103890
      -inf
      -0.310155
      -1.321756
      -inf
      -2.302585
      -0.105361
    
    
      7143442
      -inf
      -0.182322
      -1.791759
      -1.696449
      -0.836248
      -0.958850
    
    
      6860601
      -inf
      0.000000
      -inf
      -inf
      0.000000
      -inf



In [7]:

    
def create_dbscan_df(oy, oX, odX, tX, tdX):
    '''Create DBSCAN lookup database with probabilities.'''
    # Get locations
    locations = oX[['longitude', 'latitude']]

    # First cluster using DBSCAN
    cl_clf = DBSCAN(eps=.001, min_samples=10, n_jobs=3)
    cluster_designations = cl_clf.fit_predict(locations.values)
    
    # Create an dataframe with temp info
    tdf = pd.DataFrame({'interest_level': oy,
                        'cluster': cluster_designations,
                        'longitude': oX['longitude'],
                        'latitude': oX['latitude']})
    
    # Calculate cluster centers
    centers = tdf.groupby('cluster')[['longitude', 'latitude']].mean()

    # Break out clusters by interest level
    counts = tdf.groupby(['cluster', 'interest_level']).size()
    count_df = counts.unstack().fillna(0)
    proba_df = count_df.apply(lambda x: np.log(x/sum(x)), axis=1)

    # Concat into df
    dbscan_df = pd.concat([proba_df, centers], axis=1)
    dbscan_df = dbscan_df.add_prefix('dbscan_avg_')
    
    # Create kd tree for faster indexing
    kd_tree = KDTree(centers)
    
    return (dbscan_df, kd_tree)


dbscan_df, kd_tree = create_dbscan_df(*all_X)









    



C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log



In [8]:

    
all_X[4].head()









    Out[8]:






  
    
      
      text_feat_high
      text_feat_low
      text_feat_medium
      text_desc_high
      text_desc_low
      text_desc_medium
    
    
      listing_id
      
      
      
      
      
      
    
  
  
    
      7142618
      -inf
      0.000000
      -inf
      -inf
      -0.356675
      -1.203973
    
    
      7210040
      -3.954089
      -0.096874
      -2.615197
      -1.203973
      -0.510826
      -2.302585
    
    
      7103890
      -inf
      -0.310155
      -1.321756
      -inf
      -2.302585
      -0.105361
    
    
      7143442
      -inf
      -0.182322
      -1.791759
      -1.696449
      -0.836248
      -0.958850
    
    
      6860601
      -inf
      0.000000
      -inf
      -inf
      0.000000
      -inf



In [9]:

    
def create_dbscan_probabilities(oy, oX, odX, tX, tdX, dbscan_df, kd_tree):
    '''Create probabilities for both derived dataframes.'''
    
    # Get nearest cluster from indices.
    train_indices = kd_tree.query(oX[['longitude', 'latitude']],
                                  return_distance=False,
                                  dualtree=True).flatten()

    # Merge indices with index
    train_cluster_df = pd.DataFrame({'cluster': train_indices}, index=oX.index)

    # Join df to eliminate cluster and add probs. 
    train_dbscan_probs = train_cluster_df.join(dbscan_df,
                                               how='left',
                                               on='cluster')[['dbscan_avg_high',
                                                              'dbscan_avg_low',
                                                              'dbscan_avg_medium']]

    # Append probabilities to original derived database for train
    odX = pd.concat([odX, train_dbscan_probs], axis=1)
    
    # Get the index for the closest cluster center for test.
    test_indices = kd_tree.query(tX[['longitude', 'latitude']],
                                 return_distance=False,
                                 dualtree=True).flatten()
    
    # Merge indices with index
    test_cluster_df = pd.DataFrame({'cluster': test_indices}, index=tX.index)

    # Join df to eliminate cluster and add probs. 
    test_dbscan_probs = test_cluster_df.join(dbscan_df,
                                             how='left',
                                             on='cluster')[['dbscan_avg_high',
                                                            'dbscan_avg_low',
                                                            'dbscan_avg_medium']]

    # Append probabilities to original derived database
    tdX = pd.concat([tdX, test_dbscan_probs], axis=1)

    return [oy, oX, odX, tX, tdX]


all_X = create_dbscan_probabilities(*all_X, dbscan_df, kd_tree)



In [10]:

    
all_X[4].head()









    Out[10]:






  
    
      
      text_feat_high
      text_feat_low
      text_feat_medium
      text_desc_high
      text_desc_low
      text_desc_medium
      dbscan_avg_high
      dbscan_avg_low
      dbscan_avg_medium
    
    
      listing_id
      
      
      
      
      
      
      
      
      
    
  
  
    
      7142618
      -inf
      0.000000
      -inf
      -inf
      -0.356675
      -1.203973
      -2.564949
      -0.282567
      -1.776492
    
    
      7210040
      -3.954089
      -0.096874
      -2.615197
      -1.203973
      -0.510826
      -2.302585
      -2.602690
      -0.730888
      -0.810930
    
    
      7103890
      -inf
      -0.310155
      -1.321756
      -inf
      -2.302585
      -0.105361
      -inf
      -0.143101
      -2.014903
    
    
      7143442
      -inf
      -0.182322
      -1.791759
      -1.696449
      -0.836248
      -0.958850
      -inf
      -0.271934
      -1.435085
    
    
      6860601
      -inf
      0.000000
      -inf
      -inf
      0.000000
      -inf
      -inf
      -0.318454
      -1.299283



In [11]:

    
def add_room_pricing_features(oy, oX, odX, tX, tdX):
    '''Add value-ish features to dataframe.'''
    # Create features
    train_pricing_df = pd.DataFrame({'dollars_per_bedroom': oX['bedrooms']/oX['price'],
                                     'dollars_per_bathroom': oX['bathrooms']/ oX['price'],
                                     'bathroom_percentage': oX['bathrooms'] / (oX['bedrooms'] + oX['bathrooms']),
                                     'price': oX['price'],
                                     'bathrooms': oX['bathrooms'],
                                     'bedrooms': oX['bedrooms']},
                                     index=oX.index)
    # Percentiles 
    train_pricing_df = (train_pricing_df.rank(axis=1, pct=True, method='dense')
                                        .applymap(lambda x: np.log(x)))

    # Append to original original derived dataframe
    odX = pd.concat([odX, train_pricing_df], axis=1)
    
    # Create features
    test_pricing_df = pd.DataFrame({'dollars_per_bedroom': tX['bedrooms']/ tX['price'],
                                    'dollars_per_bathroom': tX['bathrooms']/ tX['price'],
                                    'bathroom_percentage': tX['bathrooms'] / (tX['bedrooms'] + tX['bathrooms']),
                                    'price': tX['price'],
                                    'bathrooms': tX['bathrooms'],
                                    'bedrooms': tX['bedrooms']},
                                    index=tX.index)
    # Percentiles
    test_pricing_df = (test_pricing_df.rank(axis=1, pct=True, method='dense')
                                      .applymap(lambda x: np.log(x)))

    # Append to original original derived dataframe
    tdX = pd.concat([tdX, test_pricing_df], axis=1)
    
    return [oy, oX, odX, tX, tdX]


all_X = add_room_pricing_features(*all_X)



In [12]:

    
all_X[4].head()
cp_odX = all_X[2].copy()
cp_tdX = all_X[4].copy()



In [17]:

    
def create_boosted_clf(oy, oX, odX, tX, tdX):
    '''Create gbc, train gbc, cross validate, and return gbc.'''
    # Cast negative infinity to something model can work with
    odX = odX.replace(-np.inf, np.float32(9.9e-25))
    tdX = tdX.replace(-np.inf, np.float32(9.9e-25))
    # Fill na values with whatever.
    odX = odX.fillna(odX.mean())
    tdX = tdX.fillna(tdX.mean())
    le = LabelEncoder().fit(oy)
    # Grid search
    params = {#'loss': ['deviance', 'exponential'],
              # 'learning_rate': [1, 0.1, .01],
              # 'n_estimators': [100, 250],
              'subsample': [1.0, .5], 
              #'min_samples_split': [2, 5],
              #'min_samples_leaf': [1, 2, 5],
              'min_weight_fraction_leaf': [0.0, .2],
              #'max_depth': [1, 3, 5],
              'min_impurity_split': [1e-05, 1e-09]}
    gbc = GridSearchCV(GradientBoostingClassifier(),
                       params,
                       scoring='neg_log_loss',
                       n_jobs=3)
    gbc.fit(odX, le.transform(oy))
    # Score
    print(cross_val_score(gbc,
                          odX,
                          le.transform(oy),
                          scoring='accuracy'))
    
    return ([oy, oX, odX, tX, tdX], gbc, le)

    
all_X, gbc, le = create_boosted_clf(*all_X)









    



[ 0.72633882  0.72500152  0.72541033]



In [14]:

    
gbc









    Out[14]:





GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'subsample': [1.0, 0.5], 'min_weight_fraction_leaf': [0.0, 0.2], 'min_impurity_split': [1e-05, 1e-09]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)



In [16]:

    
def output_predictions(oy, oX, odX, tX, tdX, gbc, le):
    '''Create the actual predictions and write to csv.'''
    # Run predictions and write to csv
    predictions = pd.DataFrame(gbc.predict_proba(tdX.values),
                               columns=le.classes_,
                               index=tdX.index)
    predictions = predictions[['high', 'medium', 'low']]
    predictions.to_csv('renthop_predictions.csv',
                       index_label='listing_id')
    return predictions


predictions = output_predictions(*all_X, gbc, le)
predictions.head(5)

	high	medium	low
listing_id
7142618	0.010191	0.178271	0.811537
7210040	0.127037	0.103861	0.769102
7103890	0.002051	0.487031	0.510918
7143442	0.008680	0.445127	0.546193
6860601	0.035208	0.137436	0.827356

	text_feat_high	text_feat_low	text_feat_medium
listing_id
7142618	-inf	0.000000	-inf
7210040	-3.954089	-0.096874	-2.615197
7103890	-inf	-0.310155	-1.321756
7143442	-inf	-0.182322	-1.791759
6860601	-inf	0.000000	-inf