In [355]:
import pandas
import numpy as np
import scipy

import pickle

from itertools import islice

from zillow_hackathon.dataset import Neighborhood

import sklearn.preprocessing
import scipy.spatial.distance
import sklearn.feature_extraction
import sklearn.cluster

from sklearn.externals import joblib

import pylab as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline

Vectorize Data


In [356]:
def get_features(n):
    # return dict(n.data['num_attrs'].items())
    return dict(n.data['spec_attrs'].items())    
    # return dict(n.data['num_attrs'].items() + n.data['spec_attrs'].items())    
    # return { 'price': + n.data['num_attrs']['median_list_price'] }

In [357]:
sea_nbrs = list(Neighborhood.get_neighborhoods_in_city('Seattle'))
sf_nbrs = list(Neighborhood.get_neighborhoods_in_city('San Francisco'))

nbrs = sea_nbrs + sf_nbrs
sf_start_idx = len(sea_nbrs)

nbrs_features = [get_features(n) for n in nbrs]
vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=False) 

vectorizer.fit(nbrs_features)
nbrs_features = vectorizer.transform(nbrs_features)
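
DictVectorizer turns each attribute dict into one fixed-width numeric row, one column per key, with string values one-hot encoded. A minimal self-contained illustration with made-up attributes (walk_score and vibe are hypothetical, not fields of the real dataset):

In [ ]:
from sklearn.feature_extraction import DictVectorizer

# Toy example: numeric values pass through, string values become one-hot columns
toy = [{'walk_score': 90, 'vibe': 'urban'},
       {'walk_score': 55, 'vibe': 'residential'}]

toy_vec = DictVectorizer(sparse=False)
print(toy_vec.fit_transform(toy))
print(toy_vec.get_feature_names())  # e.g. ['vibe=residential', 'vibe=urban', 'walk_score']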

Normalize


In [358]:
min_max_scaler = sklearn.preprocessing.MinMaxScaler()
min_max_scaler.fit(nbrs_features)
nbrs_features = min_max_scaler.transform(nbrs_features)
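
Min-max scaling maps every column to the [0, 1] range independently, so attributes measured on very different scales get comparable weight in the distance computations below. A tiny self-contained check:

In [ ]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Each column is rescaled to [0, 1] on its own
demo = np.array([[1.0, 200.0],
                 [2.0, 400.0],
                 [3.0, 800.0]])
print(MinMaxScaler().fit_transform(demo))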

Reduce Dimensionality


In [359]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

LSA_DIMENSIONS = 10
 
svd = TruncatedSVD(LSA_DIMENSIONS)
lsa = make_pipeline(svd, Normalizer(copy=False))
# lsa = svd

LSA_ENABLED = True

if LSA_ENABLED:
    lsa.fit(nbrs_features)

def reduce_dim(v):
    if LSA_ENABLED:
        return lsa.transform(v)
    else:
        return v

nbrs_features_reduced = reduce_dim(nbrs_features)
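
LSA_DIMENSIONS = 10 is picked by hand. One quick way to sanity-check the choice is the variance retained by the truncated SVD; a sketch assuming the cell above has run with LSA_ENABLED = True so that `svd` is fitted:

In [ ]:
# Rough check of how much of the original variance the 10 SVD components keep
print("explained variance kept: %.1f%%" % (svd.explained_variance_ratio_.sum() * 100))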

Find Similar Items


In [360]:
def find_similar(v1, where):
    # Rank every row of `where` by cosine similarity to v1, highest first
    res = []

    for i, v2 in enumerate(where):
        d = 1 - scipy.spatial.distance.cosine(v1, v2)
        # d = 1 - scipy.spatial.distance.euclidean(v1, v2)
        # d = 1 - scipy.spatial.distance.jaccard(v1, v2)
        # d = np.dot(v1, v2)

        res.append((i, d))

    return sorted(res, key=lambda p: -p[1])

def find_top_similar(city, name, start_idx = 0):
    # Project a neighborhood into the reduced feature space and print its
    # 5 closest matches; start_idx > 0 restricts the search (e.g. to the
    # San Francisco slice of the combined list).
    x = Neighborhood.get_for_city_and_neighborhood(city, name)
    x = reduce_dim(min_max_scaler.transform(vectorizer.transform(get_features(x))))

    top_similar = find_similar(x, nbrs_features_reduced[start_idx:])
    nbrs_attrs = nbrs[start_idx:]

    print "Results for %s" % name
    for r in islice(top_similar, 5):
        id = r[0]
        d = r[1]

        print "#%d: %s, %s: %f" % (id, nbrs_attrs[id].name, nbrs_attrs[id].city, d)
    print
    
find_top_similar('Seattle', 'Capitol Hill')
find_top_similar('Seattle', 'University District')
find_top_similar('Seattle', 'Northgate')
find_top_similar('Seattle', 'International District')
find_top_similar('Seattle', 'Ballard')

find_top_similar('Seattle', 'Capitol Hill', sf_start_idx)
find_top_similar('Seattle', 'University District', sf_start_idx)
find_top_similar('Seattle', 'Northgate', sf_start_idx)
find_top_similar('Seattle', 'International District', sf_start_idx)
find_top_similar('Seattle', 'Ballard', sf_start_idx)


Results for Capitol Hill
#40: Capitol Hill, Seattle: 1.000000
#59: Madrona, Seattle: 0.927815
#60: Leschi, Seattle: 0.920813
#96: Haight-Ashbury, San Francisco: 0.906331
#48: Eastlake, Seattle: 0.897862

Results for University District
#23: University District, Seattle: 1.000000
#17: Roosevelt, Seattle: 0.968666
#24: Wallingford, Seattle: 0.919612
#18: Ravenna, Seattle: 0.913614
#42: Broadway, Seattle: 0.897299

Results for Northgate
#5: Northgate, Seattle: 1.000000
#1: Bitter Lake, Seattle: 0.838865
#8: North College Park, Seattle: 0.821499
#72: South Beacon Hill, Seattle: 0.791077
#10: Cedar Park, Seattle: 0.783397

Results for International District
#53: International District, Seattle: 1.000000
#91: Chinatown, San Francisco: 0.943900
#52: Pioneer Square, Seattle: 0.880095
#63: Brighton, Seattle: 0.872657
#71: Holly Park, Seattle: 0.826490

Results for Ballard
#28: Ballard, Seattle: 1.000000
#114: Western Addition, San Francisco: 0.941970
#31: Loyal Heights, Seattle: 0.910955
#30: Sunset Hill, Seattle: 0.896611
#36: North Queen Anne, Seattle: 0.885240

Results for Capitol Hill
#7: Haight-Ashbury, San Francisco: 0.906331
#8: Inner Sunset, San Francisco: 0.815526
#18: Potrero Hill, San Francisco: 0.814947
#16: Pacific Heights, San Francisco: 0.805592
#25: Western Addition, San Francisco: 0.804322

Results for University District
#10: Nob Hill, San Francisco: 0.773800
#25: Western Addition, San Francisco: 0.758886
#7: Haight-Ashbury, San Francisco: 0.730630
#8: Inner Sunset, San Francisco: 0.722136
#12: North Beach, San Francisco: 0.670617

Results for Northgate
#4: Excelsior, San Francisco: 0.748200
#2: Chinatown, San Francisco: 0.741198
#24: Visitacion Valley, San Francisco: 0.734050
#0: Bayview, San Francisco: 0.706928
#13: Oceanview, San Francisco: 0.685424

Results for International District
#2: Chinatown, San Francisco: 0.943900
#5: Financial District, San Francisco: 0.729878
#22: South of Market, San Francisco: 0.633850
#10: Nob Hill, San Francisco: 0.614599
#1: Bernal Heights, San Francisco: 0.613112

Results for Ballard
#25: Western Addition, San Francisco: 0.941970
#18: Potrero Hill, San Francisco: 0.837834
#10: Nob Hill, San Francisco: 0.830548
#22: South of Market, San Francisco: 0.807443
#7: Haight-Ashbury, San Francisco: 0.805971
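
The loop in find_similar is easy to read but makes one scipy call per pair. If speed ever matters, scikit-learn's cosine_similarity computes the whole similarity matrix in one vectorized call; a sketch assuming `nbrs_features_reduced` from the cells above:

In [ ]:
from sklearn.metrics.pairwise import cosine_similarity

# One call gives the similarity of every neighborhood to every other
sim_matrix = cosine_similarity(nbrs_features_reduced)
print(sim_matrix.shape)  # (number of neighborhoods, number of neighborhoods)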

Clustering Helpers


In [344]:
#
# Creates dictionary of cluster-to-item mapping
#
def map_clusters_to_items(cls):
    cluster_to_item_map = dict()

    for item_id in range(len(cls.labels_)):
        cluster_id = cls.labels_[item_id]

        if not cluster_id in cluster_to_item_map:
            cluster_to_item_map[cluster_id] = []

        cluster_to_item_map[cluster_id].append(item_id)

    return cluster_to_item_map

#
# Print clusters
#
def print_clusters(cls):
    clusters = map_clusters_to_items(cls)
    for cluster_id, items in sorted(clusters.iteritems(), key = lambda kv: [ -len(kv[1]) ]):
        print("Cluster #%d: %d items" % (cluster_id, len(items)))

        for item_id in islice(items, 5):
            item = nbrs[item_id]
            print("\t%s, %s" % (item.data['name'], item.data['city']))
            print
        print
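
The same cluster-to-item mapping can be written a bit more compactly with collections.defaultdict; an equivalent sketch:

In [ ]:
from collections import defaultdict

# Equivalent to map_clusters_to_items, without the explicit membership check
def map_clusters_to_items_v2(cls):
    cluster_to_item_map = defaultdict(list)
    for item_id, cluster_id in enumerate(cls.labels_):
        cluster_to_item_map[cluster_id].append(item_id)
    return dict(cluster_to_item_map)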

Run DBSCAN Clustering (No Fixed # of Clusters)


In [351]:
from sklearn.cluster import DBSCAN

# eps: max distance for two points to count as neighbors;
# min_samples: neighbors required for a point to be a dense core
dbscan = DBSCAN(eps=0.3, min_samples=3)
dbscan.fit(nbrs_features_reduced)

print_clusters(dbscan)


Cluster #-1: 81 items
	Broadview, Seattle

	Bitter Lake, Seattle

	North Beach, Seattle

	Crown Hill, Seattle

	Greenwood, Seattle


Cluster #2: 15 items
	Roosevelt, Seattle

	Ravenna, Seattle

	University District, Seattle

	Wallingford, Seattle

	Green Lake, Seattle


Cluster #0: 6 items
	Brighton, Seattle

	Beacon Hill, Seattle

	Holly Park, Seattle

	South Beacon Hill, Seattle

	South Park, Seattle


Cluster #1: 6 items
	Magnolia, Seattle

	Lawton Park, Seattle

	West Queen Anne, Seattle

	Industrial District, Seattle

	Admiral, Seattle


Cluster #4: 4 items
	Noe Valley, San Francisco

	Pacific Heights, San Francisco

	Presidio Heights, San Francisco

	Twin Peaks, San Francisco


Cluster #3: 3 items
	Downtown, Seattle

	Belltown, Seattle

	Waterfront, Seattle
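
Note that label -1 is DBSCAN's marker for noise points that fall outside every dense region, so the 81-item group above is not a real cluster. A quick count, assuming the `dbscan` object fit above:

In [ ]:
import numpy as np

# -1 marks noise, not a cluster
noise = np.sum(dbscan.labels_ == -1)
n_clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
print("%d noise points, %d clusters" % (noise, n_clusters))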


Run K-Means Clustering (Fixed # of Clusters)


In [346]:
# Partition the reduced vectors into a fixed number (10) of clusters
km = sklearn.cluster.KMeans(n_clusters=10, init='k-means++', max_iter=100, n_init=1,
                            verbose=True)

km.fit(nbrs_features_reduced)
print_clusters(km)


Initialization complete
Iteration  0, inertia 26.889
Iteration  1, inertia 20.721
Iteration  2, inertia 19.874
Iteration  3, inertia 19.383
Iteration  4, inertia 19.105
Iteration  5, inertia 18.969
Iteration  6, inertia 18.937
Converged at iteration 6
Cluster #3: 28 items
	Phinney Ridge, Seattle

	Magnolia, Seattle

	Lawton Park, Seattle

	Interbay, Seattle

	North Queen Anne, Seattle


Cluster #5: 14 items
	North Beach, Seattle

	Matthews Beach, Seattle

	Wedgwood, Seattle

	View Ridge, Seattle

	Bryant, Seattle


Cluster #0: 12 items
	Crown Hill, Seattle

	Greenwood, Seattle

	Ballard, Seattle

	Whittier Heights, Seattle

	Sunset Hill, Seattle


Cluster #1: 11 items
	Brighton, Seattle

	Dunlap, Seattle

	Rainier Beach, Seattle

	Beacon Hill, Seattle

	North Beacon Hill, Seattle


Cluster #7: 11 items
	Atlantic, Seattle

	Judkins Park, Seattle

	Columbia City, Seattle

	Hillman City, Seattle

	Seward Park, Seattle


Cluster #4: 10 items
	Bitter Lake, Seattle

	Northgate, Seattle

	Haller Lake, Seattle

	Pinehurst, Seattle

	North College Park, Seattle


Cluster #8: 9 items
	Roosevelt, Seattle

	Ravenna, Seattle

	University District, Seattle

	Wallingford, Seattle

	Green Lake, Seattle


Cluster #6: 7 items
	Downtown, Seattle

	Pioneer Square, Seattle

	International District, Seattle

	Waterfront, Seattle

	Chinatown, San Francisco


Cluster #9: 7 items
	Rainier View, Seattle

	Bayview, San Francisco

	Excelsior, San Francisco

	Oceanview, San Francisco

	Outer Mission, San Francisco


Cluster #2: 6 items
	Broadview, Seattle

	Gatewood, Seattle

	Fauntleroy, Seattle

	Arbor Heights, Seattle

	Lakeshore, San Francisco
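
n_clusters=10 is chosen by hand. A common way to compare candidate values is the mean silhouette score; a sketch that refits K-Means for several k, assuming `nbrs_features_reduced` from the cells above:

In [ ]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Higher mean silhouette is better
for k in (5, 8, 10, 12, 15):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(nbrs_features_reduced)
    print("k=%2d  silhouette=%.3f" % (k, silhouette_score(nbrs_features_reduced, labels)))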


Plotting the First 3 Dimensions


In [347]:
fig = plt.figure("", figsize=(10, 10))
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

plt.cla()
labels = km.labels_

# Scatter of the first three reduced dimensions, colored by K-Means cluster id
ax.scatter(nbrs_features_reduced[:, 0], nbrs_features_reduced[:, 1], nbrs_features_reduced[:, 2], c=labels.astype(np.float))

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])


Out[347]:
[]

Persisting the Models


In [354]:
joblib.dump(min_max_scaler, 'data/output/min_max_scaler.pkl')
joblib.dump(lsa, 'data/output/lsa.pkl')
joblib.dump(vectorizer, 'data/output/vectorizer.pkl')


Out[354]:
['data/output/vectorizer.pkl']
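
The dumped models can be reloaded later to project new neighborhoods without refitting; a minimal sketch assuming the same scikit-learn version and the paths used above:

In [ ]:
from sklearn.externals import joblib

# Reload the fitted transformers
vectorizer = joblib.load('data/output/vectorizer.pkl')
min_max_scaler = joblib.load('data/output/min_max_scaler.pkl')
lsa = joblib.load('data/output/lsa.pkl')

# A new neighborhood's feature dict can then be projected the same way:
# v = lsa.transform(min_max_scaler.transform(vectorizer.transform(features)))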

In [ ]: