Author: Theo Naunheim
Date: 2017-04-22
License: MIT
Link: https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries
Create feature probabilities based on 'features' text and 'description' text.
Create feature probabilities based on longitude and latitude DBSCAN clusters.
Create features based on perceived value.
Throw all the features at a gradient boosted classifier and let god sort them out.
In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KDTree
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
In [2]:
# Original matrix
odf = pd.read_json('./train.json').set_index('listing_id')
oy = odf['interest_level']
oX = odf.drop('interest_level', axis=1)
# Test matrix
tX = pd.read_json('./test.json').set_index('listing_id')
# Original derived matrix
odX = pd.DataFrame(index=oX.index)
# Test-derived matrix
tdX = pd.DataFrame(index=tX.index)
# All matrices.
all_X = [oy, oX, odX, tX, tdX]
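For orientation: interest_level takes three string values ('high', 'medium', 'low') and is heavily skewed toward 'low', while odX and tdX start out empty and are filled with derived features by the cells below. A quick look, as an illustrative extra cell that was not part of the original run:

print(oy.value_counts(normalize=True))
print(odX.shape, tdX.shape)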
In [3]:
def create_features_probabilities(oy, oX, odX, tX, tdX):
'''Create probabilities for 'feature' column.'''
# Make text vectors
o_feat = oX['features'].map(lambda text_list: ' '.join(text_list))
t_feat = tX['features'].map(lambda text_list: ' '.join(text_list))
# Gridsearch Params (TODO, add more)
parameters = {'vec__ngram_range': [(1, 1)],
'vec__min_df': [10, 15],
'trans__use_idf': [True]}
# Create pipeline
pipeline = Pipeline([('vec', CountVectorizer()),
('trans', TfidfTransformer()),
('clf', ExtraTreesClassifier())])
# Gridsearch
feat_clf = GridSearchCV(pipeline, parameters, n_jobs=3)
# Fit
feat_clf.fit(o_feat.values, oy.values)
# List classes and create friendly labels
feat_classes = feat_clf.best_estimator_.classes_
    feat_headers = ['text_feat_' + c for c in feat_classes]
# Get proba and add to original derived dataframe
tdf1 = pd.DataFrame.from_records(feat_clf.predict_log_proba(o_feat),
columns=feat_headers,
index=oX.index)
odX = pd.concat([odX, tdf1], axis=1)
# Get proba and add to test derived dataframe.
tdf2 = pd.DataFrame.from_records(feat_clf.predict_log_proba(t_feat),
columns=feat_headers,
index=tX.index)
tdX = pd.concat([tdX, tdf2], axis=1)
return [oy, oX, odX, tX, tdX]
all_X = create_features_probabilities(*all_X)
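For reference on the text being vectorized here: each entry in the 'features' column is a list of amenity tags (e.g. 'Doorman', 'Elevator', 'Cats Allowed'), and the join above flattens it into one space-separated string that CountVectorizer can tokenize. A standalone toy illustration of that step (example tags only):

from sklearn.feature_extraction.text import CountVectorizer

tags = [['Doorman', 'Elevator', 'Cats Allowed'], ['No Fee']]
joined = [' '.join(t) for t in tags]
vec = CountVectorizer(ngram_range=(1, 1), min_df=1).fit(joined)
print(sorted(vec.vocabulary_))   # lower-cased unigrams: allowed, cats, doorman, ...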
In [4]:
all_X[4].head()
Out[4]:
In [5]:
def create_description_probabilities(oy, oX, odX, tX, tdX):
    '''Create probabilities for 'description' column.'''
# Make text vectors
o_desc = oX['description']
t_desc = tX['description']
# Gridsearch Params
parameters = {'vec__ngram_range': [(1, 3)],
'vec__min_df': [10],
'trans__use_idf': [True]}
# Create pipeline
pipeline = Pipeline([('vec', CountVectorizer()),
('trans', TfidfTransformer()),
('clf', ExtraTreesClassifier())])
# Gridsearch
desc_clf = GridSearchCV(pipeline, parameters, n_jobs=3)
# Fit
desc_clf.fit(o_desc.values, oy.values)
# List classes and create friendly labels
desc_classes = desc_clf.best_estimator_.classes_
    desc_headers = ['text_desc_' + c for c in desc_classes]
# Get proba and add to original derived dataframe -> temp df 1
tdf1 = pd.DataFrame.from_records(desc_clf.predict_log_proba(o_desc),
columns=desc_headers,
index=oX.index)
odX = pd.concat([odX, tdf1], axis=1)
# Get proba and add to test derived dataframe -> temp df 2
tdf2 = pd.DataFrame.from_records(desc_clf.predict_log_proba(t_desc),
columns=desc_headers,
index=tX.index)
tdX = pd.concat([tdX, tdf2], axis=1)
return [oy, oX, odX, tX, tdX]
all_X = create_description_probabilities(*all_X)
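A side note on the pipeline shape: CountVectorizer followed by TfidfTransformer is equivalent to scikit-learn's single TfidfVectorizer, so the description step could also be written as below (equivalent form, shown for reference only); the two-step form used above keeps vectorizer and transformer parameters under separate grid-search prefixes.

from sklearn.feature_extraction.text import TfidfVectorizer

# Same behaviour as CountVectorizer(ngram_range=(1, 3), min_df=10) + TfidfTransformer(use_idf=True)
vec = TfidfVectorizer(ngram_range=(1, 3), min_df=10, use_idf=True)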
In [6]:
all_X[4].head()
Out[6]:
In [7]:
def create_dbscan_df(oy, oX, odX, tX, tdX):
    '''Create a DBSCAN cluster lookup dataframe and a KD-tree over cluster centers.'''
# Get locations
locations = oX[['longitude', 'latitude']]
# First cluster using DBSCAN
cl_clf = DBSCAN(eps=.001, min_samples=10, n_jobs=3)
cluster_designations = cl_clf.fit_predict(locations.values)
    # Create a temporary dataframe with cluster assignments, targets, and coordinates
tdf = pd.DataFrame({'interest_level': oy,
'cluster': cluster_designations,
'longitude': oX['longitude'],
'latitude': oX['latitude']})
# Calculate cluster centers
centers = tdf.groupby('cluster')[['longitude', 'latitude']].mean()
# Break out clusters by interest level
counts = tdf.groupby(['cluster', 'interest_level']).size()
count_df = counts.unstack().fillna(0)
proba_df = count_df.apply(lambda x: np.log(x/sum(x)), axis=1)
# Concat into df
dbscan_df = pd.concat([proba_df, centers], axis=1)
dbscan_df = dbscan_df.add_prefix('dbscan_avg_')
# Create kd tree for faster indexing
kd_tree = KDTree(centers)
return (dbscan_df, kd_tree)
dbscan_df, kd_tree = create_dbscan_df(*all_X)
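For intuition on the DBSCAN scale above: eps is in raw coordinate units (degrees), and around New York's latitude 0.001 degrees is on the order of a city block, so min_samples=10 listings within roughly that radius form a cluster. A back-of-the-envelope conversion, assuming an NYC latitude of about 40.75:

import numpy as np

deg = 0.001
lat_metres = deg * 111320.0                           # 1 degree of latitude is about 111.32 km
lon_metres = deg * 111320.0 * np.cos(np.radians(40.75))
print(round(lat_metres), round(lon_metres))           # roughly 111 m and 84 m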
In [8]:
all_X[4].head()
Out[8]:
In [9]:
def create_dbscan_probabilities(oy, oX, odX, tX, tdX, dbscan_df, kd_tree):
'''Create probabilities for both derived dataframes.'''
    # Query the KD-tree for the nearest cluster-center position for each train listing
train_indices = kd_tree.query(oX[['longitude', 'latitude']],
return_distance=False,
dualtree=True).flatten()
    # Map KDTree row positions back to DBSCAN cluster labels, keyed by listing
    train_cluster_df = pd.DataFrame({'cluster': dbscan_df.index[train_indices]},
                                    index=oX.index)
    # Join on cluster label to pull in the per-cluster log-probabilities
train_dbscan_probs = train_cluster_df.join(dbscan_df,
how='left',
on='cluster')[['dbscan_avg_high',
'dbscan_avg_low',
'dbscan_avg_medium']]
    # Append probabilities to the train derived dataframe
odX = pd.concat([odX, train_dbscan_probs], axis=1)
# Get the index for the closest cluster center for test.
test_indices = kd_tree.query(tX[['longitude', 'latitude']],
return_distance=False,
dualtree=True).flatten()
    # Map KDTree row positions back to DBSCAN cluster labels, keyed by listing
    test_cluster_df = pd.DataFrame({'cluster': dbscan_df.index[test_indices]},
                                   index=tX.index)
    # Join on cluster label to pull in the per-cluster log-probabilities
test_dbscan_probs = test_cluster_df.join(dbscan_df,
how='left',
on='cluster')[['dbscan_avg_high',
'dbscan_avg_low',
'dbscan_avg_medium']]
    # Append probabilities to the test derived dataframe
tdX = pd.concat([tdX, test_dbscan_probs], axis=1)
return [oy, oX, odX, tX, tdX]
all_X = create_dbscan_probabilities(*all_X, dbscan_df, kd_tree)
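One detail worth spelling out: KDTree.query returns row positions into the array the tree was built from, while dbscan_df is indexed by DBSCAN's cluster labels (which start at -1 for the noise group), so the positions are mapped back through dbscan_df.index before the join. A minimal standalone demonstration with made-up centres:

import pandas as pd
from sklearn.neighbors import KDTree

centers = pd.DataFrame({'longitude': [-73.99, -73.95],
                        'latitude': [40.73, 40.77]},
                       index=[-1, 0])                  # DBSCAN labels: -1 is noise
tree = KDTree(centers)
pos = tree.query([[-73.951, 40.771]], return_distance=False).flatten()
print(pos, centers.index[pos].tolist())                # position [1] maps to label [0]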
In [10]:
all_X[4].head()
Out[10]:
In [11]:
def add_room_pricing_features(oy, oX, odX, tX, tdX):
'''Add value-ish features to dataframe.'''
# Create features
    train_pricing_df = pd.DataFrame({'bedrooms_per_dollar': oX['bedrooms'] / oX['price'],
                                     'bathrooms_per_dollar': oX['bathrooms'] / oX['price'],
                                     'bathroom_percentage': oX['bathrooms'] / (oX['bedrooms'] + oX['bathrooms']),
                                     'price': oX['price'],
                                     'bathrooms': oX['bathrooms'],
                                     'bedrooms': oX['bedrooms']},
                                    index=oX.index)
# Percentiles
train_pricing_df = (train_pricing_df.rank(axis=1, pct=True, method='dense')
.applymap(lambda x: np.log(x)))
    # Append to the train derived dataframe
odX = pd.concat([odX, train_pricing_df], axis=1)
# Create features
    test_pricing_df = pd.DataFrame({'bedrooms_per_dollar': tX['bedrooms'] / tX['price'],
                                    'bathrooms_per_dollar': tX['bathrooms'] / tX['price'],
                                    'bathroom_percentage': tX['bathrooms'] / (tX['bedrooms'] + tX['bathrooms']),
                                    'price': tX['price'],
                                    'bathrooms': tX['bathrooms'],
                                    'bedrooms': tX['bedrooms']},
                                   index=tX.index)
# Percentiles
test_pricing_df = (test_pricing_df.rank(axis=1, pct=True, method='dense')
.applymap(lambda x: np.log(x)))
    # Append to the test derived dataframe
tdX = pd.concat([tdX, test_pricing_df], axis=1)
return [oy, oX, odX, tX, tdX]
all_X = add_room_pricing_features(*all_X)
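A note on the transform above: rank(axis=1, pct=True, method='dense') ranks each listing's six columns against one another and rescales the ranks into (0, 1], and the subsequent np.log maps those percentiles into (-inf, 0]. A toy example of the same call, with made-up values:

import numpy as np
import pandas as pd

toy = pd.DataFrame({'price': [3000.0], 'bedrooms': [2.0], 'bathrooms': [1.0]})
ranked = toy.rank(axis=1, pct=True, method='dense')
print(ranked)            # bathrooms gets the smallest percentile, price the largest
print(np.log(ranked))    # log-percentiles, 0.0 for the largest column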
In [12]:
all_X[4].head()
cp_odX = all_X[2].copy()
cp_tdX = all_X[4].copy()
In [17]:
def create_boosted_clf(oy, oX, odX, tX, tdX):
'''Create gbc, train gbc, cross validate, and return gbc.'''
    # Replace negative infinity (log of zero probabilities) with a tiny finite value
odX = odX.replace(-np.inf, np.float32(9.9e-25))
tdX = tdX.replace(-np.inf, np.float32(9.9e-25))
    # Fill remaining NaN values with column means
odX = odX.fillna(odX.mean())
tdX = tdX.fillna(tdX.mean())
le = LabelEncoder().fit(oy)
# Grid search
params = {#'loss': ['deviance', 'exponential'],
# 'learning_rate': [1, 0.1, .01],
# 'n_estimators': [100, 250],
'subsample': [1.0, .5],
#'min_samples_split': [2, 5],
#'min_samples_leaf': [1, 2, 5],
'min_weight_fraction_leaf': [0.0, .2],
#'max_depth': [1, 3, 5],
'min_impurity_split': [1e-05, 1e-09]}
gbc = GridSearchCV(GradientBoostingClassifier(),
params,
scoring='neg_log_loss',
n_jobs=3)
gbc.fit(odX, le.transform(oy))
# Score
print(cross_val_score(gbc,
odX,
le.transform(oy),
scoring='accuracy'))
return ([oy, oX, odX, tX, tdX], gbc, le)
all_X, gbc, le = create_boosted_clf(*all_X)
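About the -np.inf replacement at the top of that cell: the infinities come from taking np.log of zero probabilities, e.g. a location cluster with no 'high'-interest listings, or a tree ensemble that assigns a class zero probability. A minimal illustration:

import numpy as np

counts = np.array([0.0, 12.0, 3.0])          # e.g. high/low/medium counts in one cluster
print(np.log(counts / counts.sum()))         # first entry is -inf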
In [14]:
gbc
Out[14]:
In [16]:
def output_predictions(oy, oX, odX, tX, tdX, gbc, le):
'''Create the actual predictions and write to csv.'''
# Run predictions and write to csv
predictions = pd.DataFrame(gbc.predict_proba(tdX.values),
columns=le.classes_,
index=tdX.index)
predictions = predictions[['high', 'medium', 'low']]
predictions.to_csv('renthop_predictions.csv',
index_label='listing_id')
return predictions
predictions = output_predictions(*all_X, gbc, le)
predictions.head(5)
Out[16]:
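As a final sanity check before submitting (an illustrative cell, not part of the original run): each row of predict_proba output should sum to one, and the column order should match the competition's sample submission (high, medium, low).

print(np.allclose(predictions.sum(axis=1), 1.0))
print(predictions.columns.tolist())          # ['high', 'medium', 'low']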