(c) Karen Belita
Team Neighborhood Change
Last Updated 9/7/2016

NOTEBOOK: CLUSTER VISUALIZATIONS

Creates a few visualizations to help decide which features to use for unsupervised machine learning.

OUTPUT

  • visualizations
  • csv file is created for each cluster

DEPENDENCIES

indicators_clustered_completed.csv -- contains all of the imputed features along with the cluster assignment


In [70]:
import warnings
import pandas as pd
import os 
import csv
import xlrd
import numpy as np
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler,RobustScaler, Normalizer


import matplotlib
matplotlib.style.use('ggplot')

%matplotlib inline

PART 1

  • read indicators_clustered_completed.csv

In [71]:
# Load the cluster-labelled indicators file that every later cell works from.
# The path is relative, so the CSV must sit in the notebook's working directory.
indicators_path = 'indicators_clustered_completed.csv'
df = pd.read_csv(indicators_path)

In [72]:
# Show the first five rows as a quick sanity check of the loaded table.
df.head(n=5)


Out[72]:
Unnamed: 0 cbsa msa_name cluster violent_crime_rate murder_manslaughter rape robbery aggravated_assault property_crime_rate ... pct_laccess_hhnv10 event_mpmt fatalities_mpmt injuries_mpmt walk_score transit_score bike_score unemploymentrate employment laborforce
0 0 35620 New York-Newark-Jersey City NY-NJ-PA Metro Are... 5 371.4 3.1 16.6 135.3 216.3 1609.9 ... 0.338235 0.418116 0.005243 0.396239 74.500000 44.833333 30.333333 6.40 9324454.0 9965273.0
1 1 31080 Los Angeles-Long Beach-Anaheim CA Metro Area ... 2 368.9 4.4 21.6 136.6 206.2 2050.4 ... 0.224680 0.298524 0.004415 0.345391 60.052632 19.368421 24.000000 6.75 6932227.5 7426637.5
2 2 16980 Chicago-Naperville-Elgin IL-IN-WI Metro Area ... 7 380.1 6.3 28.2 143.3 202.2 2135.4 ... 0.473720 0.579050 0.004533 0.708461 49.700000 20.300000 7.000000 7.10 4540001.0 4888002.0
3 3 19100 Dallas-Fort Worth-Arlington TX Metro Area ... 13 332.9 4.4 31.8 120.8 175.8 3036.4 ... 0.650693 0.494725 0.006083 0.583938 38.714286 19.714286 26.285714 5.00 3357271.0 3534675.0
4 4 26420 Houston-The Woodlands-Sugar Land TX Metro Area... 1 567.4 5.8 33.2 232.2 296.1 3208.1 ... 3.322921 0.492627 0.001631 0.724259 29.000000 7.400000 9.800000 5.00 3079095.0 3239227.0

5 rows × 44 columns


In [73]:
# Keep only the identifier, cluster and indicator columns, in this order.
# The CSV's leftover 'Unnamed: 0' index column is not listed, so it is dropped.
indicator_columns = [
    # identifiers and cluster label
    'cbsa', 'msa_name', 'cluster',
    # crime
    'violent_crime_rate', 'murder_manslaughter', 'rape', 'robbery',
    'aggravated_assault', 'property_crime_rate', 'burglary', 'larceny_theft',
    'motor_vehicle_theft', 'total_crime_rate',
    # housing cost and income
    'median_gross_rent', 'median_monthly_mortgage', 'rent_burden',
    'mortgage_burden', 'income_change_2012_to_2014',
    # demographics
    'median_age_of_men', 'median_age_of_women', 'median_age',
    'median_household_income', 'single_men_population',
    'single_women_population', 'ratio_of_single_men_to_single_women',
    'population_percent_of_single_men', 'population_percent_of_single_women',
    'population',
    # education
    'edu_average_scale_score',
    # low-access percentages
    'pct_laccess_pop10', 'pct_laccess_lowi10', 'pct_laccess_child10',
    'pct_laccess_seniors10', 'pct_laccess_hhnv10',
    # "mpmt" event, fatality and injury measures
    'event_mpmt', 'fatalities_mpmt', 'injuries_mpmt',
    # walk / transit / bike scores
    'walk_score', 'transit_score', 'bike_score',
    # labor market
    'unemploymentrate', 'employment', 'laborforce',
]
df = df[indicator_columns]

In [74]:
# Display the remaining column names to confirm the selection above.
column_names = df.columns.values
column_names


Out[74]:
array(['cbsa', 'msa_name', 'cluster', 'violent_crime_rate',
       'murder_manslaughter', 'rape', 'robbery', 'aggravated_assault',
       'property_crime_rate', 'burglary', 'larceny_theft',
       'motor_vehicle_theft', 'total_crime_rate', 'median_gross_rent',
       'median_monthly_mortgage', 'rent_burden', 'mortgage_burden',
       'income_change_2012_to_2014', 'median_age_of_men',
       'median_age_of_women', 'median_age', 'median_household_income',
       'single_men_population', 'single_women_population',
       'ratio_of_single_men_to_single_women',
       'population_percent_of_single_men',
       'population_percent_of_single_women', 'population',
       'edu_average_scale_score', 'pct_laccess_pop10',
       'pct_laccess_lowi10', 'pct_laccess_child10',
       'pct_laccess_seniors10', 'pct_laccess_hhnv10', 'event_mpmt',
       'fatalities_mpmt', 'injuries_mpmt', 'walk_score', 'transit_score',
       'bike_score', 'unemploymentrate', 'employment', 'laborforce'], dtype=object)

In [ ]:


In [ ]:

PART 2:

  • Plotting
  • creating csv files for each cluster group, to be used in the plotly online application

In [30]:
import os

import plotly.plotly as py
import cufflinks as cf
import pandas as pd


# Plotly credentials are read from the environment instead of being embedded
# in the notebook, so the API key is never saved, committed or shared with it.
# Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the kernel.
# NOTE(review): the API key that used to be hard-coded in this cell has been
# exposed in the notebook and should be regenerated in the plotly account.
try:
    plotly_username = os.environ['PLOTLY_USERNAME']
    plotly_api_key = os.environ['PLOTLY_API_KEY']
except KeyError as missing:
    raise RuntimeError(
        'Missing plotly credential {}: set PLOTLY_USERNAME and '
        'PLOTLY_API_KEY in the environment before running this cell.'
        .format(missing))

py.sign_in(plotly_username, plotly_api_key)

# NOTE(review): this empty matplotlib figure is not used by the plotly call
# below; kept so the cell still defines `bubbles_mpl` -- confirm and remove.
bubbles_mpl = plt.figure()

# Send cufflinks plots to the online plotly service as publicly readable plots.
cf.set_config_file(offline=False, world_readable=True, theme='pearl')


# Bubble chart of every row: total crime rate vs. rent burden, sized by population.
df.iplot(kind='bubble', x= 'total_crime_rate', y='rent_burden', size= 'population', 
             xTitle='total crime rate', yTitle='rent burden',
             filename='clusters')


High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~karen.belita/0 or inside your plot.ly account where it is named 'clusters'
Out[30]:
<matplotlib.figure.Figure at 0x1178afa10>

Scale down population (divide by 100,000) so it can be used as the marker size in the bubble plots


In [77]:
# Summary statistics of the population column, inspected before rescaling it.
df.loc[:, 'population'].describe()


Out[77]:
count    3.000000e+02
mean     8.894465e+05
std      1.808600e+06
min      1.397230e+05
25%      1.948065e+05
50%      3.407890e+05
75%      7.324810e+05
max      2.009288e+07
Name: population, dtype: float64

In [79]:
# Express population in units of 100,000 people.
# This overwrites the column in place, so re-running the cell without
# reloading df divides the values by 100,000 again.
df['population'] = df['population'].div(100000)

data per cluster


In [80]:
import plotly.plotly as py
import plotly.graph_objs as go

import pandas as pd
import math


# Cluster labels to plot; one trace is built per label, in this order.
# NOTE(review): assumes the labels in df['cluster'] are 0..15 -- confirm
# against df['cluster'].unique() if the clustering is re-run.
CLUSTER_IDS = range(16)


def make_cluster_trace(frame, cluster_id, x_col='employment',
                       y_col='walk_score', size_col='population',
                       label_col='msa_name'):
    """Build the bubble-chart trace for a single cluster.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding a 'cluster' column plus the columns named below.
    cluster_id : int
        Value of the 'cluster' column selecting the rows for this trace.
    x_col, y_col : str
        Columns plotted on the x and y axes.
    size_col : str
        Column used as the marker diameter (before `sizeref` scaling).
    label_col : str
        Column used as the hover text of each marker.

    Returns
    -------
    plotly.graph_objs.Scatter
        Markers-only trace named 'Cluster <cluster_id>'.
    """
    members = frame[frame['cluster'] == cluster_id]
    return go.Scatter(x=members[x_col],
                      y=members[y_col],
                      mode='markers',
                      name='Cluster {}'.format(cluster_id),
                      text=members[label_col],
                      marker=dict(symbol='circle',
                                  sizemode='diameter',
                                  sizeref=0.85,
                                  size=members[size_col],
                                  line=dict(width=2),))


# One trace per cluster so each cluster gets its own colour and legend entry.
data = [make_cluster_trace(df, cluster_id) for cluster_id in CLUSTER_IDS]

layout = go.Layout(
    title='Indicators and Clusters',
    # Employment is drawn on a log axis.
    xaxis=dict(title='Employment',
               gridcolor='rgb(255, 255, 255)',
               type='log',
               zerolinewidth=1,
               ticklen=5,
               gridwidth=2,),
    yaxis=dict(title='Walkability',
               gridcolor='rgb(255, 255, 255)',
               zerolinewidth=1,
               ticklen=5,
               gridwidth=2,),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',)

fig = go.Figure(data=data, layout=layout)
# Uploads the figure to the online plotly account under this filename and
# embeds it in the notebook.
py.iplot(fig, filename='INDICATORS_CLUSTERS')


Out[80]:

In [ ]:


In [ ]:


In [ ]: