In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Imputer
In [3]:
with open('../pdxapartmentfinder/data/MasterApartmentData.json') as f:
my_dict = json.load(f)
dframe = DataFrame(my_dict)
dframe = dframe.T
dframe.shape
Out[3]:
In [3]:
# Recode non-numeric bath values; treat missing smoking values as 0
dframe.bath = dframe.bath.replace('shared', 0.5)
dframe.bath = dframe.bath.replace('split', 0.5)
dframe.smoking = dframe.smoking.replace(np.nan, 0)
To visualize it we need to get rid of null values. I haven't figured out the best way to clean this up yet, so for now I'm going to drop any rows that have a null value, though I recognize that this is not good analysis practice. We ended up dropping ~15% of data points.
😬
There were also some CRAZY outliers, and this analysis is focused on finding a model for the 99% of us who can't afford extravagant apartments.
In [4]:
df = dframe[dframe.price < 10000][['bath','bed','feet','price']].dropna()
sns.distplot(df.price)
Out[4]:
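Aside: distplot is deprecated in newer seaborn (>= 0.11). If you run this notebook on a current install, a rough equivalent would be:
In [ ]:
# Rough equivalent of sns.distplot(df.price) on seaborn >= 0.11
sns.histplot(df.price, kde=True)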
In [5]:
dframe.head()
Out[5]:
In [6]:
dframe.describe()
Out[6]:
In [7]:
def meanimputer(column):
    # Fill missing values with the column mean; the column is passed
    # as a single row, hence axis=1
    imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
    X = imp.fit_transform([column])
    return X[0]
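In newer scikit-learn (>= 0.20), Imputer was removed in favor of sklearn.impute.SimpleImputer. A minimal sketch of the same mean imputation with the current API (the helper name here is just for illustration):
In [ ]:
# Sketch using the newer API; SimpleImputer works column-wise on 2D input
from sklearn.impute import SimpleImputer

def meanimputer_new(column):
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    return imp.fit_transform(np.asarray(column, dtype=float).reshape(-1, 1)).ravel()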
In [148]:
from sklearn import preprocessing
def modeimputer(column):
    # Label-encode so the Imputer can work on the column;
    # missing values (np.nan) become the string 'nan' under astype(str)
    le = preprocessing.LabelEncoder()
    encoded = le.fit_transform(column.astype(str)).astype(float)
    # Mark the code that stands for 'nan' as missing
    if 'nan' in le.classes_:
        nan_code = le.transform(['nan'])[0]
        encoded[encoded == nan_code] = np.nan
    # Replace missing codes with the most frequent code
    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=1)
    X = imp.fit_transform([encoded])[0]
    # Map the integer codes back to the original labels
    return le.inverse_transform(X.astype(int))
In [159]:
arr = np.array([np.nan, 'house', 'boat', 'houseboat', 'house', np.nan, 'house','houseboat'])
prac_df = DataFrame()
prac_df['arr'] = arr
prac_df['arr']
modeimputer(prac_df['arr'])
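For reference, a simpler pandas-native sketch of the same idea (fill missing values with the column's most frequent value), not the method used above; the helper name is just for illustration:
In [ ]:
def modeimputer_pd(column):
    # mode() can return several ties; [0] takes the first
    return column.fillna(column.mode()[0])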
In [145]:
pd.isnull('nan')  # the string 'nan' is not treated as missing -> False
Out[145]:
In [158]:
# Impute means for the numeric columns and modes for the categorical ones
dframe['bath'] = meanimputer(dframe['bath'])
dframe['bed'] = meanimputer(dframe['bed'])
dframe['feet'] = meanimputer(dframe['feet'])
dframe['lat'] = meanimputer(dframe['lat'])
dframe['long'] = meanimputer(dframe['long'])
dframe['housingtype'] = modeimputer(dframe['housingtype'])
dframe['laundry'] = modeimputer(dframe['laundry'])
dframe['parking'] = modeimputer(dframe['parking'])
dframe['wheelchair'] = modeimputer(dframe['wheelchair'])
In [10]:
dframe.head()
Out[10]:
In [11]:
dframe.describe(include='all')
Out[11]:
In [ ]:
# Keep only points inside a rough bounding box around Portland
data = dframe[(dframe.lat > 45.4) & (dframe.lat < 45.6) & (dframe.long < -122.0) & (dframe.long > -123.5)]
plt.figure(figsize=(15,10))
plt.scatter(data=data, x='long', y='lat')
In [ ]:
XYdf = dframe[(dframe.lat > 45.4) & (dframe.lat < 45.6) & (dframe.long < -122.0) & (dframe.long > -123.5)]
data = [[XYdf['lat'][i], XYdf['long'][i]] for i in XYdf.index]
We'll use K-Means clustering because that's the clustering method I recently learned in class! There may be others that work better, but this is the tool I know.
In [ ]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=40)
km.fit(data)
neighborhoods = km.cluster_centers_
In [ ]:
%pylab inline
figure(1,figsize=(20,12))
plot([row[1] for row in data],[row[0] for row in data],'b.')
for i in km.cluster_centers_:
plot(i[1],i[0], 'g*',ms=25)
'''Note to Riley: come back and make it look pretty'''
In [ ]:
neighborhoods = neighborhoods.tolist()
# Tag each cluster center with its index so the index can serve as a neighborhood label
for idx, center in enumerate(neighborhoods):
    center.append(idx)
print neighborhoods
Create a function that will label each point with a number corresponding to its neighborhood.
In [ ]:
def clusterer(X, Y, neighborhoods):
    # Return the index of the nearest neighborhood center;
    # squared Euclidean distance is enough for an argmin
    neighbors = []
    for i in neighborhoods:
        distance = (i[0] - X)**2 + (i[1] - Y)**2
        neighbors.append(distance)
    closest = min(neighbors)
    return neighbors.index(closest)
In [ ]:
neighborhoodlist = []
for i in dframe.index:
    neighborhoodlist.append(clusterer(dframe['lat'][i], dframe['long'][i], neighborhoods))
dframe['neighborhood'] = neighborhoodlist
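Aside: scikit-learn can do this labeling directly. KMeans.predict assigns each point to its nearest cluster center, so the manual clusterer above is equivalent to the following sketch (assuming km is the KMeans model fitted earlier, and that lat/long have already been imputed so there are no NaNs):
In [ ]:
# Equivalent neighborhood labels straight from the fitted model
dframe['neighborhood'] = km.predict(dframe[['lat', 'long']].values)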
In [ ]:
dframe
In [ ]:
from sklearn import preprocessing
def CategoricalToBinary(dframe, column_name):
    # Label-encode the column, then add one 0/1 indicator column per category
    le = preprocessing.LabelEncoder()
    dframe[column_name] = le.fit_transform(dframe[column_name])
    unique = dframe[column_name].unique()
    serieslist = [list() for _ in xrange(len(unique))]
    for code, _ in enumerate(serieslist):
        for i, item in enumerate(dframe[column_name]):
            if item == code:
                serieslist[code].append(1)
            else:
                serieslist[code].append(0)
        dframe[column_name + str(code)] = serieslist[code]
    return dframe
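For what it's worth, pandas has this one-hot encoding built in. A sketch of the same transformation with pd.get_dummies (not what's used below):
In [ ]:
# One-hot encode a categorical column and append the indicator columns
dummies = pd.get_dummies(dframe['housingtype'], prefix='housingtype')
dframe = pd.concat([dframe, dummies], axis=1)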
In [ ]:
pd.set_option('max_columns', 100)
dframe = CategoricalToBinary(dframe,'housingtype')
dframe = CategoricalToBinary(dframe,'parking')
dframe = CategoricalToBinary(dframe,'laundry')
dframe = CategoricalToBinary(dframe,'smoking')
dframe = CategoricalToBinary(dframe,'wheelchair')
dframe = CategoricalToBinary(dframe,'neighborhood')
dframe
In [ ]:
# Drop the raw categorical columns now that they're one-hot encoded, plus the unused date/time fields
dframe = dframe.drop(['date', 'housingtype', 'parking', 'laundry', 'smoking', 'wheelchair', 'neighborhood', 'time'], axis=1)
In [ ]:
columns=list(dframe.columns)
In [ ]:
from __future__ import division
print len(dframe)
df2 = dframe[dframe.price < 10000][columns].dropna()
print len(df2)
print len(df2)/len(dframe)  # fraction of listings kept after dropping nulls and price outliers
price = df2[['price']].values
columns.remove('price')
features = df2[columns].values
from sklearn.cross_validation import train_test_split
features_train, features_test, price_train, price_test = train_test_split(features, price, test_size=0.1, random_state=42)
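Note: sklearn.cross_validation was removed in scikit-learn 0.20; on a current install the same split lives in sklearn.model_selection:
In [ ]:
# Same split with the current module layout (scikit-learn >= 0.20)
from sklearn.model_selection import train_test_split
features_train, features_test, price_train, price_test = train_test_split(
    features, price, test_size=0.1, random_state=42)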
Ok, let's put it through a Random Forest!
In [ ]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
reg = RandomForestRegressor()
reg = reg.fit(features_train, price_train)
In [ ]:
forest_pred = reg.predict(features_test)
forest_pred = np.array([[item] for item in forest_pred])
In [ ]:
print r2_score(price_test, forest_pred)  # r2_score expects (y_true, y_pred)
plt.scatter(forest_pred, price_test)
In [ ]:
df2['predictions'] = reg.predict(df2[columns])
In [ ]:
df2['predictions_diff'] = df2['predictions']-df2['price']
In [ ]:
sd = np.std(df2['predictions_diff'])
sns.kdeplot(df2['predictions_diff'][(df2['predictions_diff'] > -150) & (df2['predictions_diff'] < 150)])
plt.xlim(-150, 150)
In [ ]:
data = df2[(df2.lat > 45.45) & (df2.lat < 45.6) & (df2.long < -122.4) & (df2.long > -122.8) & (df2['predictions_diff'] > -150) & (df2['predictions_diff'] < 150)]
plt.figure(figsize=(15,10))
plt.scatter(data=data, x='long', y='lat', c='predictions_diff', s=10, cmap='coolwarm')
In [ ]:
dframe
In [ ]:
print np.mean([1,2,34,np.nan])  # NaN propagates; np.nanmean would ignore it
In [ ]:
def averager(dframe):
    # Average price by (bed, bath, neighborhood, feet-rounded-down-to-50) bucket
    dframe = dframe.T
    dframe = dframe.dropna(axis=1)  # drop listings (columns after the transpose) with missing values
    averages = {}
    for listing in dframe:
        try:
            key = str(dframe[listing]['bed'])+','+str(dframe[listing]['bath'])+','+str(dframe[listing]['neighborhood'])+','+str(dframe[listing]['feet']-dframe[listing]['feet']%50)
            if key not in averages:
                averages[key] = {'average_list':[dframe[listing]['price']], 'average':0}
            else:
                averages[key]['average_list'].append(dframe[listing]['price'])
        except TypeError:
            continue
    for entry in averages:
        averages[entry]['average'] = np.mean(averages[entry]['average_list'])
    return averages
In [ ]:
averages = averager(dframe)
print averages
In [ ]:
# Look up each listing's bucket average row by row, using the same key the averager builds
def bucket_key(row):
    return str(row['bed'])+','+str(row['bath'])+','+str(row['neighborhood'])+','+str(row['feet']-row['feet']%50)
dframe['averages'] = [averages[bucket_key(dframe.loc[i])]['average'] if bucket_key(dframe.loc[i]) in averages else np.nan for i in dframe.index]
In [ ]:
dframe.T
Wow! Up to .87! That's our best yet! What if we add more trees???
In [ ]:
reg = RandomForestRegressor(n_estimators = 100)
reg = reg.fit(features_train, price_train)
In [ ]:
forest_pred = reg.predict(features_test)
forest_pred = np.array([[item] for item in forest_pred])
In [ ]:
print r2_score(price_test, forest_pred)  # r2_score expects (y_true, y_pred)
plt.scatter(forest_pred, price_test)
In [ ]:
from sklearn.tree import DecisionTreeRegressor
reg = DecisionTreeRegressor(max_depth = 5)
reg.fit(features_train, price_train)
print len(features_train[0])
columns = [str(x) for x in columns]
print columns
from sklearn.tree import export_graphviz
export_graphviz(reg,feature_names=columns)
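export_graphviz writes a Graphviz .dot file (tree.dot by default in this era of scikit-learn). To actually look at the tree, you can render it to an image; a sketch, assuming the Graphviz dot executable is installed:
In [ ]:
# Render the exported tree to a PNG with Graphviz
import subprocess
subprocess.call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png'])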
Up to .88!
So what is our goal now? I'd like to see if adjusting the number of neighborhoods increases the accuracy, and the same for the effect of the number of trees.
In [ ]:
def neighborhood_optimizer(dframe, neighborhood_number_range, counter_num):
    # For each candidate number of neighborhoods, cluster, retrain,
    # and average the r2 score over counter_num runs
    XYdf = dframe[(dframe.lat > 45.4) & (dframe.lat < 45.6) & (dframe.long < -122.0) & (dframe.long > -123.5)]
    data = [[XYdf['lat'][i], XYdf['long'][i]] for i in XYdf.index]
    r2_dict = []
    for i in neighborhood_number_range:
        counter = counter_num
        average_accuracy_list = []
        while counter > 0:
            km = KMeans(n_clusters=i)
            km.fit(data)
            neighborhoods = km.cluster_centers_.tolist()
            for idx, center in enumerate(neighborhoods):
                center.append(idx)
            neighborhoodlist = []
            for z in dframe.index:
                neighborhoodlist.append(clusterer(dframe['lat'][z], dframe['long'][z], neighborhoods))
            dframecopy = dframe.copy()
            dframecopy['neighborhood'] = Series(neighborhoodlist, index=dframe.index)
            df2 = dframecopy[dframecopy.price < 10000][['bath','bed','feet','dog','cat','content','getphotos','hasmap','price','neighborhood']].dropna()
            features = df2[['bath','bed','feet','dog','cat','content','getphotos','hasmap','neighborhood']].values
            price = df2[['price']].values
            features_train, features_test, price_train, price_test = train_test_split(features, price, test_size=0.1)
            reg = RandomForestRegressor()
            reg = reg.fit(features_train, price_train)
            forest_pred = reg.predict(features_test)
            forest_pred = np.array([[item] for item in forest_pred])
            counter -= 1
            average_accuracy_list.append(r2_score(price_test, forest_pred))
        # Mean r2 across the repeated runs for this cluster count
        r2_accuracy = sum(average_accuracy_list) / len(average_accuracy_list)
        r2_dict.append((i, r2_accuracy))
    print r2_dict
    return r2_dict
In [ ]:
neighborhood_number_range = range(2, 31, 2)
neighborhood_number_range
In [ ]:
r2_dict = neighborhood_optimizer(dframe,neighborhood_number_range,10)
In [ ]:
r2_dict[0]
In [ ]:
plt.scatter([x[0] for x in r2_dict],[x[1] for x in r2_dict])
Looks like the optimum is right around 10 or 11, and then accuracy starts to drop off. Let's get a little more granular and look at a smaller range.
In [ ]:
neighborhood_number_range = range(7, 15)
neighborhood_number_range
In [ ]:
r2_dict = neighborhood_optimizer(dframe,neighborhood_number_range,10)
In [ ]:
print r2_dict
plt.scatter([x[0] for x in r2_dict],[x[1] for x in r2_dict])
Trying a few times, it looks like 10, 11, and 12 get the best results at ~.85. Of course, we'll need to redo some of these optimizations after we properly process our data. Hopefully we'll see some more consistency then too.
In [ ]:
r2_dict = neighborhood_optimizer(dframe,[10,11,12],25)
Note #1 to Riley: (From last time) Perhaps look into another regressor? See if there's one that's inherently better at this kind of thing.
Note #2 to Riley: Figure out how to process the data so that you don't have to drop null values.
Note #3 to Riley: Convert categorical data into binary.
Note #4 to Riley: I wonder if increasing the number of neighborhoods would become more accurate as we collect more data? You could create a bunch of little accurate models instead of a few bigger ones.
Learned: If you plan on using Decision Tree/Random Forest from SKLearn, make sure you collect your discrete variables in separate columns and make them binary yes or no (0 or 1).