Project: McNulty
Date: 02/22/2017
Name: Prashant Tatineni
In this project, I attempt to predict the popularity (target variable: interest_level) of apartment rental listings based on listing characteristics. The data comes from a Kaggle Competition.
AWS and SQL were not used to join additional data, since the dataset was provided as a single file, train.json (49,352 rows).
An additional file, test.json (74,659 rows), contains the same columns as train.json, except that the target variable, interest_level, is missing. Predictions of the target variable are to be made on the test.json file and submitted to Kaggle.
This notebook builds and evaluates classification models on train.json, with interest_level as the target; the same pipeline is then rerun on test.json to predict interest_level for the test dataset.
In [18]:
# imports
import pandas as pd
import dateutil.parser
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
%matplotlib inline
In [2]:
# Load the training dataset from Kaggle.
df = pd.read_json('data/raw/train.json')
print(df.shape)
In [3]:
df.head(2)
Out[3]:
Total number of columns is 14 + 1 target (interest_level).
Features for modeling: bathrooms, bedrooms, price, latitude, longitude, plus engineered counts (days_old, num_words, num_features, num_photos), indicator variables for the reduced feature categories, and manager performance.
Further opportunities for modeling: the description text, the photos themselves, and building_id.
In [6]:
# Distribution of target, interest_level
s = df.groupby('interest_level')['listing_id'].count()
s.plot.bar();
In [9]:
df_high = df.loc[df['interest_level'] == 'high']
df_medium = df.loc[df['interest_level'] == 'medium']
df_low = df.loc[df['interest_level'] == 'low']
In [17]:
plt.figure(figsize=(6,10))
plt.scatter(df_low.longitude, df_low.latitude, color='yellow', alpha=0.2, marker='.', label='Low')
plt.scatter(df_medium.longitude, df_medium.latitude, color='green', alpha=0.2, marker='.', label='Medium')
plt.scatter(df_high.longitude, df_high.latitude, color='purple', alpha=0.2, marker='.', label='High')
plt.xlim(-74.04,-73.80)
plt.ylim(40.6,40.9)
plt.legend(loc=2);
In [22]:
(pd.to_datetime(df['created'])).sort_values(ascending=False).head()
Out[22]:
In [24]:
# The most recent records are 6/29/2016. Computing days old from 6/30/2016.
df['days_old'] = (dateutil.parser.parse('2016-06-30') - pd.to_datetime(df['created'])).apply(lambda x: x.days)
In [25]:
# Add other "count" features
df['num_words'] = df['description'].apply(lambda x: len(x.split()))
df['num_features'] = df['features'].apply(len)
df['num_photos'] = df['photos'].apply(len)
In [62]:
X = df[['bathrooms','bedrooms','price','latitude','longitude','days_old','num_words','num_features','num_photos']]
y = df['interest_level']
In [63]:
# Standardize features; Logistic Regression and KNN are sensitive to feature scale
X_scaled = pd.DataFrame(preprocessing.scale(X))
X_scaled.columns = X.columns
In [64]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=42)
In [65]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
Out[65]:
In [66]:
y_test_predicted_proba = lr.predict_proba(X_test)
log_loss(y_test, y_test_predicted_proba)
Out[66]:
In [67]:
lr = LogisticRegression(solver='newton-cg', multi_class='multinomial')
lr.fit(X_train, y_train)
Out[67]:
In [68]:
y_test_predicted_proba = lr.predict_proba(X_test)
log_loss(y_test, y_test_predicted_proba)
Out[68]:
In [69]:
for i in [95, 100, 105]:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_test_predicted_proba = knn.predict_proba(X_test)
    print(log_loss(y_test, y_test_predicted_proba))
In [70]:
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
rf.fit(X_train, y_train)
Out[70]:
In [71]:
y_test_predicted_proba = rf.predict_proba(X_test)
log_loss(y_test, y_test_predicted_proba)
Out[71]:
In [72]:
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
Out[72]:
In [73]:
y_test_predicted_proba = bnb.predict_proba(X_test)
log_loss(y_test, y_test_predicted_proba)
Out[73]:
In [74]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
Out[74]:
In [75]:
y_test_predicted_proba = gnb.predict_proba(X_test)
log_loss(y_test, y_test_predicted_proba)
Out[75]:
In [76]:
clf = MLPClassifier(hidden_layer_sizes=(100,50,10))
clf.fit(X_train, y_train)
Out[76]:
In [77]:
y_test_predicted_proba = clf.predict_proba(X_test)
log_loss(y_test, y_test_predicted_proba)
Out[77]:
In [26]:
# Reduce 1556 unique feature text values into 35 main categories
def reduce_categories(full_list):
    reduced_list = []
    for i in full_list:
        item = i.lower()
        if 'cats allowed' in item:
            reduced_list.append('cats')
        if 'dogs allowed' in item:
            reduced_list.append('dogs')
        if 'elevator' in item:
            reduced_list.append('elevator')
        if 'hardwood' in item:
            reduced_list.append('hardwood')
        if 'doorman' in item or 'concierge' in item:
            reduced_list.append('doorman')
        if 'dishwasher' in item:
            reduced_list.append('dishwasher')
        if 'laundry' in item or 'dryer' in item:
            if 'unit' in item:
                reduced_list.append('laundry_in_unit')
            else:
                reduced_list.append('laundry')
        if 'no fee' in item:
            reduced_list.append('no_fee')
        if 'reduced fee' in item:
            reduced_list.append('reduced_fee')
        if 'fitness' in item or 'gym' in item:
            reduced_list.append('gym')
        if 'prewar' in item or 'pre-war' in item:
            reduced_list.append('prewar')
        if 'dining room' in item:
            reduced_list.append('dining')
        if 'pool' in item:
            reduced_list.append('pool')
        if 'internet' in item:
            reduced_list.append('internet')
        if 'new construction' in item:
            reduced_list.append('new_construction')
        if 'wheelchair' in item:
            reduced_list.append('wheelchair')
        if 'exclusive' in item:
            reduced_list.append('exclusive')
        if 'loft' in item:
            reduced_list.append('loft')
        if 'simplex' in item:
            reduced_list.append('simplex')
        if 'fire' in item:
            reduced_list.append('fireplace')
        if 'lowrise' in item or 'low-rise' in item:
            reduced_list.append('lowrise')
        if 'midrise' in item or 'mid-rise' in item:
            reduced_list.append('midrise')
        if 'highrise' in item or 'high-rise' in item:
            reduced_list.append('highrise')
        if 'ceiling' in item:
            reduced_list.append('high_ceiling')
        if 'garage' in item or 'parking' in item:
            reduced_list.append('parking')
        if 'furnished' in item:
            reduced_list.append('furnished')
        if 'multi-level' in item:
            reduced_list.append('multilevel')
        if 'renovated' in item:
            reduced_list.append('renovated')
        if 'super' in item:
            reduced_list.append('live_in_super')
        if 'green building' in item:
            reduced_list.append('green_building')
        if 'appliances' in item:
            reduced_list.append('new_appliances')
        if 'luxury' in item:
            reduced_list.append('luxury')
        if 'penthouse' in item:
            reduced_list.append('penthouse')
        if 'deck' in item or 'terrace' in item or 'balcony' in item or 'outdoor' in item or 'roof' in item or 'garden' in item or 'patio' in item:
            reduced_list.append('outdoor_space')
    return list(set(reduced_list))
In [27]:
df['categories'] = df['features'].apply(reduce_categories)
In [30]:
text = ''
for index, row in df.iterrows():
    for i in row.categories:
        text = text + i + ' '
In [32]:
plt.figure(figsize=(12,6))
wc = WordCloud(background_color='white', width=1200, height=600).generate(text)
plt.title('Reduced Categories', fontsize=30)
plt.axis("off")
wc.recolor(random_state=0)
plt.imshow(wc);
In [ ]:
# Create indicators
X_dummies = pd.get_dummies(df['categories'].apply(pd.Series).stack()).sum(level=0)
In [79]:
# Choose features for modeling (after sorting df by listing_id)
df = df.sort_values('listing_id')
X = df[['bathrooms','bedrooms','price','latitude','longitude','days_old','num_words','num_features','num_photos','listing_id','manager_id']]
y = df['interest_level']
In [82]:
# Merge indicators to X dataframe and sort again to match sorting of y
X = X.merge(X_dummies, how='outer', left_index=True, right_index=True).fillna(0)
X = X.sort_values('listing_id')
In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
In [103]:
# Compute interest-level ratios for each manager in the training split (counts are added below)
mgr_perf = pd.concat([X_train.manager_id,pd.get_dummies(y_train)], axis=1).groupby('manager_id').mean()
In [104]:
mgr_perf.head(2)
Out[104]:
In [105]:
mgr_perf['manager_count'] = X_train.groupby('manager_id').count().iloc[:,1]
mgr_perf['manager_skill'] = mgr_perf['high']*1 + mgr_perf['medium']*0 + mgr_perf['low']*-1
In [106]:
# for training set
X_train = X_train.merge(mgr_perf.reset_index(), how='left', left_on='manager_id', right_on='manager_id')
In [109]:
# for test set
X_test = X_test.merge(mgr_perf.reset_index(), how='left', left_on='manager_id', right_on='manager_id')
# Fill na's with mean skill and median count
X_test['manager_skill'] = X_test.manager_skill.fillna(X_test.manager_skill.mean())
X_test['manager_count'] = X_test.manager_count.fillna(X_test.manager_count.median())
In [111]:
# Delete unnecessary columns before modeling
del X_train['listing_id']
del X_train['manager_id']
del X_test['listing_id']
del X_test['manager_id']
del X_train['high']
del X_train['medium']
del X_train['low']
del X_test['high']
del X_test['medium']
del X_test['low']
In [112]:
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
rf.fit(X_train, y_train)
Out[112]:
In [114]:
y_test_predicted_proba = rf.predict_proba(X_test)
log_loss(y_test, y_test_predicted_proba)
Out[114]:
In [115]:
y_test_predicted = rf.predict(X_test)
accuracy_score(y_test, y_test_predicted)
Out[115]:
In [116]:
precision_recall_fscore_support(y_test, y_test_predicted)
Out[116]:
In [117]:
rf.classes_
Out[117]:
In [122]:
plt.figure(figsize=(15,5))
pd.Series(index = X_train.columns, data = rf.feature_importances_).sort_values().plot(kind = 'bar');
As seen here, introducing feature categories and manager performance has improved the model. In particular, manager_skill shows up as the dominant feature in terms of importance in this random forest model.
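For a quick numerical check of that claim, the same importances can be listed directly from the fitted model (a small sketch using the rf and X_train objects above):
In [ ]:
# Print the ten largest feature importances from the fitted random forest
importances = pd.Series(index=X_train.columns, data=rf.feature_importances_)
print(importances.sort_values(ascending=False).head(10))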
To make a prediction for submission to Kaggle, this notebook is recreated with the test.json dataset. The submission requires the predicted high, medium, and low probabilities for each listing_id.
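For reference, here is a minimal sketch of how that submission file could be assembled; df_test and X_submit are hypothetical names for the test.json dataframe and feature matrix, assumed to be built by rerunning the feature-engineering cells above on the test data.
In [ ]:
# Predict class probabilities for the test listings and write the Kaggle submission
# (df_test / X_submit are assumed test-set equivalents of df and X above)
proba = rf.predict_proba(X_submit)
submission = pd.DataFrame(proba, columns=rf.classes_)
submission['listing_id'] = df_test['listing_id'].values
submission = submission[['listing_id', 'high', 'medium', 'low']]
submission.to_csv('submission.csv', index=False)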
Further opportunities to improve prediction on this dataset lie in the text descriptions and image data, which have so far been used only as simple count features (num_words, num_photos). Building popularity could also be assessed via the building_id variable, aggregated in the same way as manager_id; a rough sketch of that idea follows.
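This sketch mirrors the mgr_perf aggregation above and would run at the same stage (right after train_test_split, before the other merges); it assumes building_id is kept in X alongside manager_id, and building_skill is a hypothetical column name.
In [ ]:
# Per-building interest-level ratios on the training split, analogous to mgr_perf
# (assumes building_id was carried into X the same way manager_id was)
bldg_perf = pd.concat([X_train.building_id, pd.get_dummies(y_train)], axis=1).groupby('building_id').mean()
bldg_perf['building_skill'] = bldg_perf['high']*1 + bldg_perf['medium']*0 + bldg_perf['low']*-1
X_train = X_train.merge(bldg_perf[['building_skill']].reset_index(), how='left', on='building_id')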
It will also be valuable to spend time optimizing the model used here, perhaps with GridSearchCV.
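A minimal sketch of what that search might look like for the random forest, using GridSearchCV with log loss as the scoring metric (the grid values below are illustrative, not tuned):
In [ ]:
from sklearn.model_selection import GridSearchCV

# Illustrative hyperparameter grid for the random forest, scored by log loss
param_grid = {'n_estimators': [500, 1000],
              'max_depth': [None, 10, 20],
              'min_samples_leaf': [1, 5, 10]}
grid = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid,
                    scoring='neg_log_loss', cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)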