In [487]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import Normalizer, StandardScaler


from folium import plugins

import matplotlib.pyplot as plt

#sql stuff:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [550]:
#FUNCTIONS:

def get_nbds(new_descp):
    """
    builds a score for each neighborhood given a description as follows:
    ass up the distances
    """
    neighbors = knn.kneighbors(model.transform([new_descp]))
    closest_listings = neighbors[1][0]
    results = train.iloc[closest_listings][["neighbourhood_cleansed"]]
    results["distance"] = neighbors[0][0]

    #invert the distance:
    results["distance"] = results["distance"].max() + 1 - results["distance"]
    nbd_score = results.groupby("neighbourhood_cleansed")["distance"].sum().sort_values(ascending = False)


    nbd_score = pd.concat((nbd_score, nbd_counts), 1)
    nbd_score["weighted_score"] = nbd_score["distance"]/np.log(nbd_score["neighbourhood_cleansed"])

    return nbd_score

def locations_of_best_match(new_descp):
    neighbors = knn.kneighbors(model.transform([new_descp]))
    closest_listings = neighbors[1][0]
    results = train.iloc[closest_listings]
    return results


def draw_point_map(results, nr_pts = 300):
    map_osm = folium.Map(tiles='Cartodb Positron', location = [40.661408, -73.961750])
    #this is stupidly slow:
    for index, row in results[:nr_pts].iterrows():
        folium.CircleMarker(location=[row["latitude"], row["longitude"]], radius=row["latitude"], color = "pink").add_to(map_osm)

    return(map_osm)



def get_heat_map(descp):
    map_osm = folium.Map(tiles='Cartodb Positron', location = [40.7831, -73.970], zoom_start=13)
    results = locations_of_best_match(descp)
    temp = results[["latitude", "longitude"]].values.tolist()
    

    map_osm.add_children(plugins.HeatMap(temp, min_opacity = 0.4, radius = 20, blur = 30,
                                         gradient = return_color_scale(scale_1),
                                         name = descp))
    
    
    folium.LayerControl().add_to(mapa)
    return map_osm

In [626]:
dbname = 'airbnb_db'
username = 'alexpapiu'
con = psycopg2.connect(database = dbname, user = username)
train = pd.read_sql_query("SELECT * FROM location_descriptions", con)

nbd_counts = train["neighbourhood_cleansed"].value_counts()

descp = train[["id", "neighborhood_overview"]]

descp = descp.drop_duplicates()

In [627]:
train


Out[627]:
id neighborhood_overview neighbourhood_cleansed city latitude longitude
0 685006.0 The apartment is right across the street from ... Prospect-Lefferts Gardens Brooklyn 40.661408 -73.961750
1 9461238.0 Prime Williamsburg, the hot South side: Steps ... Williamsburg Brooklyn 40.716320 -73.957255
2 4873690.0 Long Island City is the ultimate, in my mind, ... Long Island City Queens 40.742824 -73.949939
3 14179829.0 Located on a beautiful, quiet tree lined stree... Sunnyside Queens 40.741264 -73.925798
4 7364064.0 This spacious three bedroom is located in the ... Stuyvesant Town New York 40.733109 -73.979277
5 4349358.0 The apartment is located in a beautiful, safe,... Upper West Side New York 40.774744 -73.978863
6 11165126.0 Architecture, art, and fine dining meets a thr... Chelsea New York 40.738694 -73.992643
7 16014791.0 I love the convenient of walking distance to e... Hell's Kitchen New York 40.755364 -73.999498
8 1727923.0 Williamsburg is the most dynamic neighborhood ... Williamsburg Brooklyn 40.713157 -73.956484
9 3562906.0 This is centrally located in midtown. Times Sq... Hell's Kitchen New York 40.766326 -73.990984
10 8699290.0 Around the Corner on 111th (btw 2nd & 3rd) the... East Harlem New York 40.792881 -73.939266
11 3431816.0 The apartment is located in the heart of Crown... Crown Heights Brooklyn 40.671829 -73.954914
12 3038614.0 My neighborhood is great because you can walk ... Bedford-Stuyvesant Brooklyn 40.685964 -73.958375
13 15359364.0 Williamsburg is one of the most desirable plac... Williamsburg Brooklyn 40.709073 -73.949885
14 14048645.0 Views ..Shopping..Restaurants... Transportatio... Bay Ridge Brooklyn 40.621579 -74.020155
15 8281370.0 The park, my gym, the food, strong immigrant c... Sunset Park Brooklyn 40.645446 -74.014001
16 9429904 The Food. The Energy. The Architecture. West Village New York 40.729930 -74.003363
17 661847.0 The Lower East Side is a vibrant magical neigh... Lower East Side New York 40.719669 -73.987067
18 15097204.0 Lower east side is famous for it's edgy vibe o... Lower East Side New York 40.718824 -73.984882
19 16162621.0 Brunch and bike away the day, and lounge and d... Williamsburg Brooklyn 40.713063 -73.948557
20 14572907.0 East Village location is great for restaurants... East Village New York 40.729614 -73.981422
21 15359350.0 I love the access in the neighborhood - there ... Ridgewood New York 40.699343 -73.908851
22 6184285.0 Just a few blocks to the beach, restaurants an... Arverne Queens 40.594830 -73.788404
23 1989972.0 Close to Museums,Central Park and East River w... East Harlem New York 40.788134 -73.944947
24 3693901.0 It's calm nice area but just short walk to Bed... Bedford-Stuyvesant brooklyn 40.691344 -73.944680
25 5373703.0 There is no shortage of excellent bars, restau... Clinton Hill Brooklyn 40.689432 -73.966349
26 677162.0 Riverside park is wonderful and the largest ca... Morningside Heights New York 40.806796 -73.967730
27 261869.0 The neighborhood has history, character and th... Lower East Side New York 40.720875 -73.988021
28 1834738.0 Prime location, walking distance to East and W... Chelsea New York 40.738647 -73.996204
29 4718314.0 Bed-Stuy is an up-and-coming neighbourhood wit... Bedford-Stuyvesant Brooklyn, New York 40.678933 -73.952362
... ... ... ... ... ... ...
28194 11016265.0 The neighborhood of Nopa is a very desired and... Western Addition San Francisco 37.776557 -122.445320
28195 4773949.0 The apartment is close to so many popular San ... Western Addition San Francisco 37.786112 -122.437043
28196 13359153.0 It is safety and quiet studio in a great high-... Western Addition San Francisco 37.780257 -122.434062
28197 9315067.0 The Western addition is centrally located in t... Western Addition San Francisco 37.780377 -122.432079
28198 890901.0 Located in the heart of Haight! The location ... Western Addition San Francisco 37.770238 -122.436205
28199 7278586.0 Located in Alamo Square near Hayes Valley you ... Western Addition San Francisco 37.777592 -122.433548
28200 1682072.0 Centrally located is of the safest neighborhoo... Western Addition San Francisco 37.782775 -122.443132
28201 12294106.0 The Castro, Union Square, Fisherman's Warf, BA... Western Addition San Francisco 37.775112 -122.422682
28202 1103797.0 Close to loads of nice restaurants, shops and ... Western Addition San Francisco 37.785206 -122.442787
28203 5258355.0 The neighborhood is definitely a perk! - Gro... Western Addition San Francisco 37.776322 -122.446189
28204 6575965.0 One block down and you're in one of the most b... Western Addition San Francisco 37.773001 -122.437430
28205 11016248.0 The neighborhood of Nopa is a very desired and... Western Addition San Francisco 37.776658 -122.445012
28206 1627946.0 The Lower Haight was recently voted the best n... Western Addition San Francisco 37.774710 -122.431518
28207 12212013.0 Our home is located in the heart of San Franci... Western Addition San Francisco 37.776175 -122.432171
28208 6729923.0 Sunny, easily accessible and central Hayes Val... Western Addition San Francisco 37.775824 -122.423508
28209 808660.0 Our area called Lower Pacific Heights is one o... Western Addition San Francisco 37.784603 -122.443887
28210 7227859.0 This home is located in one of the most presti... Western Addition San Francisco 37.786111 -122.440192
28211 4616783.0 We are located smack-dab in the middle of some... Western Addition San Francisco 37.774216 -122.427054
28212 3319447.0 Cole Valley is a very safe, charming and frien... Golden Gate Park San Francisco 37.766516 -122.456014
28213 2694526.0 THE NEIGHBORHOOD The Inner Richmond is a charm... Golden Gate Park San Francisco 37.772556 -122.458669
28214 10936828.0 Location!!! Golden Gate Park San Francisco 37.765627 -122.483467
28215 254953.0 Fabulous Oceanfront location next to Golden Ga... Golden Gate Park San Francisco 37.770271 -122.510905
28216 2397858.0 Sleepy local safe seaside residential area whe... Golden Gate Park San Francisco 37.770261 -122.499015
28217 9318994.0 This neighborhood is San Francisco's best kept... Golden Gate Park San Francisco 37.768756 -122.512968
28218 12753688.0 FABULOUS location location location no barbecu... Golden Gate Park San Francisco 37.770660 -122.511687
28219 13661411.0 The best thing about this location is that it ... Ocean View Daly City 37.704522 -122.464643
28220 13452434.0 The street is safe and peaceful. Walking dista... Crocker Amazon Daly City 37.707406 -122.450857
28221 13661054.0 The best thing about this location is that it ... Ocean View Daly City 37.706395 -122.463281
28222 13661869.0 The best thing about this location is that it ... Ocean View Daly City 37.705302 -122.463284
28223 13493466.0 The Ballpark, the restaurants, proximity to th... South of Market San Francisco 37.780753 -122.385530

28224 rows × 6 columns


In [632]:
#MODEL:
#spell check:
model = make_pipeline(TfidfVectorizer(stop_words = "english", min_df = 5, ngram_range = (1,1)),
                      TruncatedSVD(100),
                      Normalizer())

knn = NearestNeighbors(500, metric = "cosine", algorithm = "brute")

X = descp["neighborhood_overview"]
X_proj = model.fit_transform(X)
knn.fit(X_proj)


Out[632]:
NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=500, p=2, radius=1.0)

In [596]:
model


Out[596]:
Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_i...00, n_iter=5,
       random_state=None, tol=0.0)), ('normalizer', Normalizer(copy=True, norm='l2'))])

In [602]:
from sklearn.externals import joblib
joblib.dump(model, 'tf_idf_model.pkl')


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-602-79f2dfeb3a96> in <module>()
      1 from sklearn.externals import joblib
      2 joblib.dump(model, 'tf_idf_model.pkl')
----> 3 knn.dump(knn, 'knn.pkl')

AttributeError: 'NearestNeighbors' object has no attribute 'dump'

In [600]:
model = joblib.load('tf_idf_model.pkl')

In [636]:
descp = " trendy hip cool"

In [637]:
nbd_score = get_nbds(descp)

In [643]:
from pylab import rcParams
rcParams['figure.figsize'] = 3, 5
nbd_score["weighted_score"].dropna().sort_values().tail(12).plot(kind = "barh")


Out[643]:
<matplotlib.axes._subplots.AxesSubplot at 0x12fddf400>

In [647]:
(pd.Series([90.4, 53, 50.5, 50.3, 49.1, 48],
           index = ["Baseline", "Ridge Regression", "Random Forest", "NN", "xgboost", "NN + xgboost"]).sort_values()
.plot(kind = "barh", title = "Root Mean Squared Error in Dollars on Test Set (smaller is better)"))


Out[647]:
<matplotlib.axes._subplots.AxesSubplot at 0x14e53ba90>

In [215]:
results = locations_of_best_match(descp)

In [569]:
get_heat_map("hip urban gritty")

In [521]:
def add_heat_layer(mapa, descp, scale  = scale_2):

    results = locations_of_best_match(descp)
    temp = results[["latitude", "longitude"]].values.tolist()

    mapa.add_children(plugins.HeatMap(temp, min_opacity = 0.4, radius = 30, blur = 30,
                                      gradient = return_color_scale(scale),
                                      name = descp))
    return mapa

In [522]:
scale_2 = ["#f2eff1", "#f2eff1", "#3E4A89", "#31688E", "#26828E", "#1F9E89", "#35B779",
               "#6DCD59", "#B4DE2C", "#FDE725"]

scale_1 = ["#f2eff1", "#f2eff1", "#451077", "#721F81", "#9F2F7F", "#CD4071",
           "#F1605D",  "#FD9567",  "#FEC98D", "#FCFDBF"]

scale_3 = ["#f2eff1", "#f2eff1", "#4B0C6B", "#781C6D", "#A52C60", "#CF4446",
           "#ED6925", "#FB9A06", "#F7D03C", "#FCFFA4"]

def return_color_scale(scale):
    df = pd.Series(scale)
    df.index = np.power(df.index/10, 1/2)
    return df.to_dict()

In [634]:
mapa = get_heat_map("hip cute thrift stores bars")
add_heat_layer(mapa, "gritty authentic")
add_heat_layer(mapa, "chinese", scale = scale_3)

folium.LayerControl().add_to(mapa)


Out[634]:
<folium.map.LayerControl at 0x14d7984e0>

In [615]:
#normalize the heat map
#take distance into account.
#datashader 
mapa = get_heat_map("russian")

In [635]:
mapa


Out[635]:

In [588]:
mapa.save("map_test")

In [589]:
ls


Thumbnails/
clean_listings.csv
dedup_listings.csv
map_test
mock_clean_data.csv
neighbourhoods.geojson
new-york-city_2015-01-01_data_listings.csv
new-york-city_2015-12-02_data_listings.csv
new-york-city_2016-12-03_data_listings.csv
new-york-city_2016-12-03_data_listings.csv.gz
san-francisco_2016-07-02_data_listings.csv
san-francisco_2016-07-02_data_listings.csv.gz
sm_listings.csv

In [515]:
def get_heat_map(descp):
    map_osm = folium.Map(tiles='Cartodb Positron', location = [40.7831, -73.970], zoom_start=13)
    results = locations_of_best_match(descp)
    temp = results[["latitude", "longitude"]].values.tolist()
    

    map_osm.add_children(plugins.HeatMap(temp, min_opacity = 0.4, radius = 30, blur = 30,
                                         gradient = return_color_scale(scale_1),
                                         name = descp))
    
    
    folium.LayerControl().add_to(mapa)
    return map_osm

In [604]:
get_heat_map("cocktail")


Out[604]:

In [560]:
danger = train[train.neighborhood_overview.str.contains("dangerous")]

In [562]:
danger.neighborhood_overview.iloc[1]


Out[562]:
'Bed Stuy has a diverse mix of students, families, hipsters, artists and creative professionals.  Bed Stuy has a strong community and abundent attractive brownstone houses and tree lined streets.  Conveniently located 1 block from the subway you can be in Manhattan in 20 - 30 mins door to door. This is a very racially diverse part of Brooklyn - If you are at all narrow minded and or equate minorities with being dangerous than this is not the house for you.  We under no circumstance tolerate that kind of behavior.  Our doors are open to all nationalities and sexual orientation.  Everyone is welcome here regardless of race - creed - color - age or sexual orientation.'

In [631]:
get_heat_map("close to central park")


Out[631]:

In [633]:
get_heat_map("close to central park")


Out[633]:

In [586]:
import folium
map_osm = folium.Map(location=[45.5236, -122.6750])

In [590]:
m.create_map(path='map.html')


//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: Map.create_map is deprecated. Use Map.save instead
  if __name__ == '__main__':

In [593]:
map_osm = folium.Map(location=[45.5236, -122.6750])