In [14]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import Normalizer, StandardScaler

import folium
from folium import plugins

import matplotlib.pyplot as plt

#sql stuff:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
#FUNCTIONS:

def get_nbds(new_descp):
    """
    builds a score for each neighborhood given a description as follows:
    ass up the distances
    """
    neighbors = knn.kneighbors(model.transform([new_descp]))
    closest_listings = neighbors[1][0]
    results = train.iloc[closest_listings][["neighbourhood_cleansed"]]
    results["distance"] = neighbors[0][0]

    #invert the distance:
    results["distance"] = results["distance"].max() + 1 - results["distance"]
    nbd_score = results.groupby("neighbourhood_cleansed")["distance"].sum().sort_values(ascending = False)


    nbd_score = pd.concat((nbd_score, nbd_counts), 1)
    nbd_score["weighted_score"] = nbd_score["distance"]/np.log(nbd_score["neighbourhood_cleansed"])

    return nbd_score

def locations_of_best_match(new_descp):
    neighbors = knn.kneighbors(model.transform([new_descp]))
    closest_listings = neighbors[1][0]
    results = train.iloc[closest_listings]
    return results


def draw_point_map(results, nr_pts = 300):
    map_osm = folium.Map(tiles='Cartodb Positron', location = [40.661408, -73.961750])
    #this is stupidly slow:
    for index, row in results[:nr_pts].iterrows():
        folium.CircleMarker(location=[row["latitude"], row["longitude"]], radius=row["latitude"], color = "pink").add_to(map_osm)

    return(map_osm)



def get_heat_map(descp):
    map_osm = folium.Map(tiles='Cartodb Positron', location = [40.7831, -73.970], zoom_start=13)
    results = locations_of_best_match(descp)
    temp = results[["latitude", "longitude"]].values.tolist()
    

    map_osm.add_children(plugins.HeatMap(temp, min_opacity = 0.4, radius = 20, blur = 30,
                                         gradient = return_color_scale(scale_1),
                                         name = descp))
    
    
    folium.LayerControl().add_to(mapa)
    return map_osm

In [3]:
dbname = 'airbnb_db'
username = 'alexpapiu'
con = psycopg2.connect(database = dbname, user = username)
train = pd.read_sql_query("SELECT * FROM location_descriptions", con)

nbd_counts = train["neighbourhood_cleansed"].value_counts()

descp = train[["id", "neighborhood_overview"]]

descp = descp.drop_duplicates()

In [4]:
#MODEL:
model = make_pipeline(TfidfVectorizer(stop_words = "english", min_df = 5, ngram_range = (1,2)),
                      TruncatedSVD(100),
                      Normalizer())

knn = NearestNeighbors(500, metric = "cosine", algorithm = "brute")


X = descp["neighborhood_overview"]
X_proj = model.fit_transform(X)
knn.fit(X_proj)


Out[4]:
NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=500, p=2, radius=1.0)

In [5]:
model


Out[5]:
Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_i...00, n_iter=5,
       random_state=None, tol=0.0)), ('normalizer', Normalizer(copy=True, norm='l2'))])

In [7]:
from sklearn.externals import joblib
joblib.dump(model, 'tf_idf_model.pkl')


Out[7]:
['tf_idf_model.pkl',
 'tf_idf_model.pkl_01.npy',
 'tf_idf_model.pkl_02.npy',
 'tf_idf_model.pkl_03.npy',
 'tf_idf_model.pkl_04.npy',
 'tf_idf_model.pkl_05.npy']

In [8]:
model = joblib.load('tf_idf_model.pkl')

In [9]:
descp = "jamaican cusine culture gritty"

In [10]:
nbd_score = get_nbds(descp)

In [11]:
nbd_score["weighted_score"].dropna().sort_values().tail(12).plot(kind = "barh")


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x1130afa20>

In [12]:
results = locations_of_best_match(descp)

In [13]:
get_heat_map("hip urban gritty")


-------------------------------------------------------------------------
NameError                               Traceback (most recent call last)
<ipython-input-13-c2e92af0171d> in <module>()
----> 1 get_heat_map("hip urban gritty")

<ipython-input-2-affda8346aa3> in get_heat_map(descp)
     39 
     40 def get_heat_map(descp):
---> 41     map_osm = folium.Map(tiles='Cartodb Positron', location = [40.7831, -73.970], zoom_start=13)
     42     results = locations_of_best_match(descp)
     43     temp = results[["latitude", "longitude"]].values.tolist()

NameError: name 'folium' is not defined

In [521]:
def add_heat_layer(mapa, descp, scale  = scale_2):

    results = locations_of_best_match(descp)
    temp = results[["latitude", "longitude"]].values.tolist()

    mapa.add_children(plugins.HeatMap(temp, min_opacity = 0.4, radius = 30, blur = 30,
                                      gradient = return_color_scale(scale),
                                      name = descp))
    return mapa

In [522]:
scale_2 = ["#f2eff1", "#f2eff1", "#3E4A89", "#31688E", "#26828E", "#1F9E89", "#35B779",
               "#6DCD59", "#B4DE2C", "#FDE725"]

scale_1 = ["#f2eff1", "#f2eff1", "#451077", "#721F81", "#9F2F7F", "#CD4071",
           "#F1605D",  "#FD9567",  "#FEC98D", "#FCFDBF"]

scale_3 = ["#f2eff1", "#f2eff1", "#4B0C6B", "#781C6D", "#A52C60", "#CF4446",
           "#ED6925", "#FB9A06", "#F7D03C", "#FCFFA4"]

def return_color_scale(scale):
    df = pd.Series(scale)
    df.index = np.power(df.index/10, 1/2)
    return df.to_dict()

In [584]:
mapa = get_heat_map("hip cute thrift stores bars")
add_heat_layer(mapa, "gritty authentic")
add_heat_layer(mapa, "chinese", scale = scale_3)

folium.LayerControl().add_to(mapa)


Out[584]:
<folium.map.LayerControl at 0x14b55c860>

In [585]:
mapa


Out[585]:

In [588]:
mapa.save("map_test")

In [589]:
ls


Thumbnails/
clean_listings.csv
dedup_listings.csv
map_test
mock_clean_data.csv
neighbourhoods.geojson
new-york-city_2015-01-01_data_listings.csv
new-york-city_2015-12-02_data_listings.csv
new-york-city_2016-12-03_data_listings.csv
new-york-city_2016-12-03_data_listings.csv.gz
san-francisco_2016-07-02_data_listings.csv
san-francisco_2016-07-02_data_listings.csv.gz
sm_listings.csv

In [515]:
def get_heat_map(descp):
    map_osm = folium.Map(tiles='Cartodb Positron', location = [40.7831, -73.970], zoom_start=13)
    results = locations_of_best_match(descp)
    temp = results[["latitude", "longitude"]].values.tolist()
    

    map_osm.add_children(plugins.HeatMap(temp, min_opacity = 0.4, radius = 30, blur = 30,
                                         gradient = return_color_scale(scale_1),
                                         name = descp))
    
    
    folium.LayerControl().add_to(mapa)
    return map_osm

In [604]:
get_heat_map("cocktail")


Out[604]:

In [560]:
danger = train[train.neighborhood_overview.str.contains("dangerous")]

In [562]:
danger.neighborhood_overview.iloc[1]


Out[562]:
'Bed Stuy has a diverse mix of students, families, hipsters, artists and creative professionals.  Bed Stuy has a strong community and abundent attractive brownstone houses and tree lined streets.  Conveniently located 1 block from the subway you can be in Manhattan in 20 - 30 mins door to door. This is a very racially diverse part of Brooklyn - If you are at all narrow minded and or equate minorities with being dangerous than this is not the house for you.  We under no circumstance tolerate that kind of behavior.  Our doors are open to all nationalities and sexual orientation.  Everyone is welcome here regardless of race - creed - color - age or sexual orientation.'

In [575]:
get_heat_map("fashion")


Out[575]:

In [586]:
import folium
map_osm = folium.Map(location=[45.5236, -122.6750])

In [590]:
m.create_map(path='map.html')


//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: Map.create_map is deprecated. Use Map.save instead
  if __name__ == '__main__':

In [593]:
map_osm = folium.Map(location=[45.5236, -122.6750])