In [197]:
#Initial setup
import pandas as pd
import numpy as np
from IPython.display import Image
import plotly.plotly as py
import plotly.graph_objs as go
# Plotly credentials are read from the environment so that no API key is stored
# in the notebook. `import plotly.tools` also binds the name `plotly`, which
# `import plotly.plotly as py` alone does not do.
# NOTE(review): the keys that used to be written on these lines should be revoked.
import os
import plotly.tools

_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)
# When the variables are unset nothing is written and Plotly uses whatever
# credentials were configured previously.

# Widen the notebook container to 90% of the browser window.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
In [2]:
# Location of the zip-code level carbon data set. Set the ZCARBON_CSV
# environment variable to run the notebook on another machine; the default is
# the original author's local path.
import os
carbon_csv_path = os.environ.get('ZCARBON_CSV', '/Users/adamowens/datascience/zcarbon_v2.csv')
df_carbon = pd.read_csv(carbon_csv_path)
df_carbon.describe()
Out[2]:
In [23]:
#Key statistics
# NOTE(review): 'carbon_tax_per_capita' and 'tax_per_income' are created by
# cells further down, and 'z_carbon_tons' is assigned further down as well, so
# this cell only runs after those cells have been executed.
key_columns = [
    ('z_carbon_tons', 'C_Tons'),
    ('z_carb_ton_ph', 'C_Tons_PH'),
    ('carbon_tax_per_capita', 'C_Tax_capita'),
    ('tax_per_income', 'C_Tax_per_inc'),
]
# Select the four columns, then give them the short display names.
df_key = df_carbon[[source for source, _ in key_columns]]
df_key = df_key.rename(columns=dict(key_columns))
df_key.describe()
Out[23]:
In [3]:
#Data Prep
# Turn the id and the per-household figure into strings so they can be
# concatenated into the hover text shown on the first map.
# NOTE(review): if z_id was parsed as an integer, zip codes lose their leading
# zeros here - confirm against the CSV.
df_carbon['z_id'] = df_carbon['z_id'].map(str)
df_carbon['z_carbon'] = df_carbon['z_carb_ton_ph'].map(lambda tons: "%.3f" % tons)
hover_prefix = 'Zip Code ' + df_carbon['z_id'] + ' <br> '
df_carbon['text'] = hover_prefix + df_carbon['z_carbon'].astype(str) + ' Carbon Tons'
In [178]:
#### //////// Create Map //////// ####
# Bubble map of carbon tons per household: one scattergeo trace per bin in
# `limits`, each with its own colour and legend entry.
colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
limits = [(0,8),(8,10),(10,12),(12,15)]
zip_codes = []
scale = 1
carbon_ph = df_carbon['z_carb_ton_ph']
for i, lim in enumerate(limits):
    # Bins are [low, high) so a value sitting exactly on a shared edge (8, 10,
    # 12) lands in exactly one trace instead of none; the last bin also keeps
    # its upper edge.
    is_last = (i == len(limits) - 1)
    below_top = (carbon_ph <= lim[1]) if is_last else (carbon_ph < lim[1])
    # .loc replaces the removed .ix indexer (same result for a boolean mask).
    df_sub = df_carbon.loc[(carbon_ph >= lim[0]) & below_top]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['z_carb_ton_ph']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
    title = '2014 Carbon Tons Per Household',
    showlegend = True,
    width = 1100,
    height = 750,
    geo = dict(
        scope='usa',
        projection=dict( type='albers usa' ),
        showland = True,
        landcolor = 'rgb(217, 217, 217)',
        subunitcolor="rgb(255, 255, 255)",
        countrycolor="rgb(255, 255, 255)"),)

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='Carbon Tons Per Household-map-populations' )
Out[178]:
In [179]:
#### //////// Similar exercise as the previous map //////// ####
# Total carbon tons per zip code = tons per household x number of housing units.
df_carbon['z_carbon_tons'] = df_carbon['z_carb_ton_ph'] * df_carbon['z_house_unit']
# Hover text for the maps that follow.
total_tons_text = df_carbon['z_carbon_tons'].astype(str)
df_carbon['text2'] = 'Zip Code ' + df_carbon['z_id'] + ' <br> ' + total_tons_text + ' Carbon Tons'
In [180]:
def perc_carbon(n):
    """Return the n-th percentile of df_carbon['z_carbon_tons'], ignoring NaNs.

    n is a percentile in the range np.nanpercentile accepts (0-100).
    """
    ordered_tons = df_carbon['z_carbon_tons'].sort_values()
    return np.nanpercentile(np.array(ordered_tons), n)
In [181]:
# Bubble map of total carbon tons, one trace per quartile of z_carbon_tons.
limits = [(0,25),(25,50),(50,75),(75,100)]
colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
zip_codes = []
scale = 10000
carbon_tons = df_carbon['z_carbon_tons']
for i, lim in enumerate(limits):
    low, high = perc_carbon(lim[0]), perc_carbon(lim[1])
    # Bins are [low, high); the last one is closed so the maximum is kept.
    # With strict comparisons on both sides the minimum, the maximum and any
    # value equal to a quartile edge were silently left off the map.
    is_last = (i == len(limits) - 1)
    below_top = (carbon_tons <= high) if is_last else (carbon_tons < high)
    # .loc replaces the removed .ix indexer (same result for a boolean mask).
    df_sub = df_carbon.loc[(carbon_tons >= low) & below_top]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            size = df_sub['z_carbon_tons']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
    title = '2014 Carbon Tons<br>(By Percentile)',
    showlegend = True,
    width=1800,
    height=1200,
    geo = dict(
        scope='usa',
        projection=dict( type='albers usa' ),
        showland = True,
        landcolor = 'rgb(217, 217, 217)',
        subunitwidth=1,
        countrywidth=1,
        subunitcolor="rgb(255, 255, 255)",
        countrycolor="rgb(255, 255, 255)"),)

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='carbon-tons-percentile.png' )
Out[181]:
In [182]:
#### //////// Similar exercise as the previous map //////// ####
# Model assumes $140/Carbon ton per person (annually)
tax = 140
# Total tax for the zip code: every carbon ton is charged `tax` dollars.
df_carbon['total_carbon_tax'] = tax * df_carbon['z_carbon_tons']
# Spread the zip code's total over its population.
df_carbon['carbon_tax_per_capita'] = df_carbon['total_carbon_tax'].div(df_carbon['z_pop'])
In [183]:
def perc_carbon_pop(n):
    """Return the n-th percentile of df_carbon['carbon_tax_per_capita'].

    NaNs are ignored and the result is rounded to 2 decimals.
    """
    ordered_tax = np.array(df_carbon['carbon_tax_per_capita'].sort_values())
    return round(np.nanpercentile(ordered_tax, n), 2)
In [184]:
# Bubble map of carbon tax per capita. Bins: bottom 90%, 90-95, 95-99 and the
# top 1% of zip codes by carbon_tax_per_capita.
limits = [(perc_carbon_pop(0),perc_carbon_pop(90)),(perc_carbon_pop(90),perc_carbon_pop(95))
          ,(perc_carbon_pop(95),perc_carbon_pop(99)),(perc_carbon_pop(99),perc_carbon_pop(100))]
colors = ["lightgrey","black","orange","red"]
zip_codes = []
scale = 200
tax_per_capita = df_carbon['carbon_tax_per_capita']
# Non-finite ratios are kept off the map; the values also drive the marker size.
plottable = np.isfinite(tax_per_capita)
for i, lim in enumerate(limits):
    # Bins are [low, high); the last one is closed so the top value is kept.
    # Strict comparisons on both sides dropped every row equal to a bin edge.
    is_last = (i == len(limits) - 1)
    below_top = (tax_per_capita <= lim[1]) if is_last else (tax_per_capita < lim[1])
    # .loc replaces the removed .ix indexer (same result for a boolean mask).
    df_sub = df_carbon.loc[plottable & (tax_per_capita >= lim[0]) & below_top]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            size = df_sub['carbon_tax_per_capita']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
    title = '2014 Carbon Tax Per Capita <br>(By Percentile)',
    showlegend = True,
    width=1800,
    height=1200,
    geo = dict(
        scope='usa',
        projection=dict( type='albers usa' ),
        showland = True,
        landcolor = 'rgb(217, 217, 217)',
        subunitwidth=1,
        countrywidth=1,
        subunitcolor="rgb(255, 255, 255)",
        countrycolor="rgb(255, 255, 255)")
    ,)

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='carbon_tax_140' )
Out[184]:
In [ ]:
#### //////// Similar exercise as the previous map //////// ####
# Model assumes $140/Carbon ton per person (annually)
tax = 140
# Per-household tax: the household's carbon tons charged at `tax` dollars each.
df_carbon['carbon_tax_per_house'] = tax * df_carbon['z_carb_ton_ph']
In [10]:
def perc_carbon_house(n):
    """Return the n-th percentile of df_carbon['carbon_tax_per_house'].

    NaNs are ignored and the result is rounded to 2 decimals.
    """
    ordered_tax = np.array(df_carbon['carbon_tax_per_house'].sort_values())
    return round(np.nanpercentile(ordered_tax, n), 2)
In [185]:
# Bubble map of carbon tax per household, one trace per quartile.
limits = [(perc_carbon_house(0),perc_carbon_house(25)),(perc_carbon_house(25),perc_carbon_house(50))
          ,(perc_carbon_house(50),perc_carbon_house(75)),(perc_carbon_house(75),perc_carbon_house(100))]
colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
zip_codes = []
scale = 200
# The bins above are quartiles of carbon_tax_per_house, so rows must be
# selected on that same column; filtering on carbon_tax_per_capita (as the
# code copied from the per-capita map did) compared the wrong quantity.
tax_per_house = df_carbon['carbon_tax_per_house']
for i, lim in enumerate(limits):
    # Bins are [low, high); the last one is closed so the maximum is kept.
    is_last = (i == len(limits) - 1)
    below_top = (tax_per_house <= lim[1]) if is_last else (tax_per_house < lim[1])
    # .loc replaces the removed .ix indexer (same result for a boolean mask).
    df_sub = df_carbon.loc[(tax_per_house >= lim[0]) & below_top]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            # NOTE(review): the title says "Sized by Population" but the size
            # comes from z_households - confirm which one is intended.
            size = df_sub['z_households']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
    title = '2014 Carbon Tax Per Household (Quartiles)<br>(Sized by Population)',
    showlegend = True,
    width=1800,
    height=1200,
    geo = dict(
        scope='usa',
        projection=dict( type='albers usa' ),
        showland = True,
        landcolor = 'rgb(217, 217, 217)',
        subunitwidth=1,
        countrywidth=1,
        subunitcolor="rgb(255, 255, 255)",
        countrycolor="rgb(255, 255, 255)"
    ),
)

fig = dict( data=zip_codes, layout=layout )
# Own filename: 'carbon_tax_140' is already used by the per-capita map, and
# reusing it would overwrite that chart in the Plotly account.
py.iplot( fig, validate=False, filename='carbon_tax_per_household_140' )
Out[185]:
In [ ]:
#### //////// Similar exercise as the previous map //////// ####
# Model assumes $140/Carbon ton per person (annually)
# Per-capita carbon tax expressed as a percentage of the zip code's median income.
income_share = df_carbon['carbon_tax_per_capita'].div(df_carbon['z_med_inc'])
df_carbon['tax_per_income'] = income_share * 100
In [186]:
def perc_tax_income(n):
    """Return the n-th percentile of df_carbon['tax_per_income'].

    NaNs are ignored and the result is rounded to 2 decimals.
    """
    ordered_share = np.array(df_carbon['tax_per_income'].sort_values())
    return round(np.nanpercentile(ordered_share, n), 2)
In [187]:
# Bubble map of carbon tax as a percentage of median income, one trace per
# quartile, with bubbles sized by population.
limits = [(perc_tax_income(0),perc_tax_income(25)),(perc_tax_income(25),perc_tax_income(50))
          ,(perc_tax_income(50),perc_tax_income(75)),(perc_tax_income(75),perc_tax_income(100))]
colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
zip_codes = []
scale = 1000
tax_share = df_carbon['tax_per_income']
# Non-finite ratios are kept off the map.
plottable = np.isfinite(tax_share)
for i, lim in enumerate(limits):
    # Bins are [low, high); the last one is closed so the maximum is kept.
    # Strict comparisons on both sides dropped every row equal to a bin edge.
    is_last = (i == len(limits) - 1)
    below_top = (tax_share <= lim[1]) if is_last else (tax_share < lim[1])
    # .loc replaces the removed .ix indexer (same result for a boolean mask).
    df_sub = df_carbon.loc[plottable & (tax_share >= lim[0]) & below_top]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            size = df_sub['z_pop']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
    title = '2014 Carbon Tax as Percentage of Income (Quartiles)<br>(Sized by Population)',
    showlegend = True,
    width=1800,
    height = 1200,
    geo = dict(
        scope='usa',
        projection=dict( type='albers usa' ),
        showland = True,
        landcolor = 'rgb(217, 217, 217)',
        subunitwidth=1,
        countrywidth=1,
        subunitcolor="rgb(255, 255, 255)",
        countrycolor="rgb(255, 255, 255)"
    ),
)

fig = dict( data=zip_codes, layout=layout )
# Own filename: 'carbon_tax_140' is already used by the per-capita map, and
# reusing it would overwrite that chart in the Plotly account.
py.iplot( fig, validate=False, filename='carbon_tax_pct_income_140' )
In [ ]:
import plotly.plotly as py
import plotly.graph_objs as go
In [168]:
#create new variable measuring carbon tax as % of income
# (same formula as the tax_per_income assignment in the map section)
per_capita_over_income = df_carbon['carbon_tax_per_capita'] / df_carbon['z_med_inc']
df_carbon['tax_per_income'] = 100 * per_capita_over_income
#for percentile analysis, not used in final presentation
def perc_tax_income_feature(feature):
    """Return the 25th, 50th, 75th and 100th percentiles of `feature`.

    `feature` is a pandas Series; NaNs are ignored and every percentile is
    rounded to 3 decimals. The result is a list of four numbers.
    """
    ordered = np.array(feature.sort_values())
    return [round(np.nanpercentile(ordered, q), 3) for q in (25, 50, 75, 100)]
tax_income_perc = perc_tax_income_feature(df_carbon['tax_per_income'])
# Upper edges of the tax-as-%-of-income buckets (<0.5, 0.5-1, 1-1.5, 1.5-2,
# 2-3, >3). The last bucket is open-ended: an infinite edge keeps every zip
# code above 3% in it, instead of relying on a hardcoded maximum (42.747) that
# goes stale when the data changes and can exclude the top row.
c_tax_perc_list = [.5, 1, 1.5, 2, 3, float('inf')]
In [169]:
#function to return buckets of population levels or percentiles
def ctax_income_sum(df, tax_list):
    """Return the population that falls into each tax-as-%-of-income bucket.

    df       -- DataFrame with 'z_pop' and 'tax_per_income' columns.
    tax_list -- ascending upper edges of the buckets; a row belongs to the
                first bucket whose edge is strictly greater than its
                'tax_per_income'.

    Returns a list with one population total per edge in tax_list.
    """
    cumulative = [0]
    for upper_edge in tax_list:
        # .loc replaces the removed .ix indexer; Series.sum() skips missing
        # populations instead of turning the whole total into NaN.
        below_edge = df['tax_per_income'] < upper_edge
        cumulative.append(df.loc[below_edge, 'z_pop'].sum())
    # Differences of consecutive cumulative totals give the per-bucket totals.
    return [current - previous for previous, current in zip(cumulative, cumulative[1:])]
def ctax_income_perc(df, tax_list):
    """Return each tax-as-%-of-income bucket's share of the population, in percent.

    df       -- DataFrame with 'z_pop' and 'tax_per_income' columns.
    tax_list -- ascending upper edges of the buckets; a row belongs to the
                first bucket whose edge is strictly greater than its
                'tax_per_income'.

    Returns a list of percentages rounded to 2 decimals, one per edge in
    tax_list. When no population falls into any bucket every share is 0.0.
    """
    cumulative = [0]
    for upper_edge in tax_list:
        # .loc replaces the removed .ix indexer; Series.sum() skips missing
        # populations instead of turning the whole total into NaN.
        below_edge = df['tax_per_income'] < upper_edge
        cumulative.append(df.loc[below_edge, 'z_pop'].sum())
    buckets = [float(current - previous)
               for previous, current in zip(cumulative, cumulative[1:])]
    total = sum(buckets)
    if total == 0:
        # Empty bracket: avoid dividing by zero.
        return [0.0 for _ in buckets]
    return [round(bucket * 100 / total, 2) for bucket in buckets]
In [170]:
# Split the zip codes into median-income brackets. Each bracket is
# [lower, upper): a median exactly on a boundary (25000, 40000, ...) belongs to
# the bracket that starts there. With strict comparisons on both sides those
# zip codes fell into no bracket at all.
# .loc replaces the removed .ix indexer (same result for a boolean mask).
med_inc = df_carbon['z_med_inc']
income_less_25k = df_carbon.loc[med_inc < 25000]
income_40k = df_carbon.loc[(med_inc >= 25000) & (med_inc < 40000)]
income_50k = df_carbon.loc[(med_inc >= 40000) & (med_inc < 50000)]
income_60k = df_carbon.loc[(med_inc >= 50000) & (med_inc < 60000)]
income_70k = df_carbon.loc[(med_inc >= 60000) & (med_inc < 70000)]
income_80k = df_carbon.loc[(med_inc >= 70000) & (med_inc < 80000)]
income_90k = df_carbon.loc[(med_inc >= 80000) & (med_inc < 90000)]
income_100k = df_carbon.loc[(med_inc >= 90000) & (med_inc < 100000)]
income_125k = df_carbon.loc[(med_inc >= 100000) & (med_inc < 125000)]
income_more_125k = df_carbon.loc[med_inc >= 125000]
In [172]:
# Income brackets in ascending order; both unpackings below follow this order.
income_groups = [income_less_25k, income_40k, income_50k, income_60k, income_70k,
                 income_80k, income_90k, income_100k, income_125k, income_more_125k]

#buckets for aggregate population levels
(inc_less_25k_sum, inc_40k_sum, inc_50k_sum, inc_60k_sum, inc_70k_sum,
 inc_80k_sum, inc_90k_sum, inc_100k_sum, inc_125k_sum, inc_more_125k_sum) = [
    ctax_income_sum(group, c_tax_perc_list) for group in income_groups]

#buckets for aggregate percentages
(inc_less_25k_perc, inc_40k_perc, inc_50k_perc, inc_60k_perc, inc_70k_perc,
 inc_80k_perc, inc_90k_perc, inc_100k_perc, inc_125k_perc, inc_more_125k_perc) = [
    ctax_income_perc(group, c_tax_perc_list) for group in income_groups]
In [173]:
#Split levels for legend bar on top
# Six equal shares, one per tax bucket, expressed below as percentages.
split_list = [1/6.] * 6
a_list = [round(share * 100, 2) for share in split_list]
In [174]:
#### ///////// Bar Graph for Percentages ///////// ####
# Horizontal stacked bars: one row per income bracket, one segment per tax
# bucket, plus a final evenly split row (a_list) that acts as the legend.
top_labels = ['<0.5%','0.5-1.0%', '1.0-1.50%', '1.5-2.0%', '2.0-3.0%', '>3%']
colors = ['#77AF9C','#4ea1d3','#9055A2','#CE6D39', '#FFBC42','#D81159']
x_data = [inc_less_25k_perc, inc_40k_perc, inc_50k_perc, inc_60k_perc, inc_70k_perc, inc_80k_perc, inc_90k_perc,
          inc_100k_perc, inc_125k_perc, inc_more_125k_perc, a_list]
y_data = ['Less than $25k','$25-40k','$40-50k','$50-60k','$60-70k','$70-80k','$80-90k','$90-100k','$100-125k',
          'More than $125k','Carbon Tax as % of Income']

# One single-bar trace per (segment, row) pair; barmode='stack' places the
# segments of a row end to end.
traces = []
for i in range(0, len(x_data[0])):
    for xd, yd in zip(x_data, y_data):
        traces.append(go.Bar(
            # x and y are data arrays, so the single value is wrapped in a list.
            x=[xd[i]],
            y=[yd],
            orientation='h',
            marker=dict(
                color=colors[i],
                line=dict(
                    color='rgb(248, 248, 249)',
                    width=1))))

layout = go.Layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(
        l=5,
        r=5,
        t=70,
        b=70
    ),
    showlegend=False,)

annotations = []
for yd, xd in zip(y_data, x_data):
    # labeling the y-axis
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))
    # labeling the first percentage of each bar (x_axis), centred in its segment
    annotations.append(dict(xref='x', yref='y',
                            x=xd[0] / 2, y=yd,
                            text=str(xd[0]) + '%',
                            font=dict(family='Arial', size=10,
                                      color='rgb(248, 248, 255)'),
                            showarrow=False))
    # labeling the first bucket on the top (legend row only)
    if yd == y_data[-1]:
        annotations.append(dict(xref='x', yref='paper',
                                x=xd[0] / 2, y=1.1,
                                text=top_labels[0],
                                font=dict(family='Arial', size=13,
                                          color='rgb(67, 67, 67)'),
                                showarrow=False))
    # `space` is the running width of the segments already labelled in this row
    space = xd[0]
    for i in range(1, len(xd)):
        # labeling the rest of percentages for each bar (x_axis)
        annotations.append(dict(xref='x', yref='y',
                                x=space + (xd[i]/2), y=yd,
                                text=str(xd[i]) + '%',
                                font=dict(family='Arial', size=10,
                                          color='rgb(248, 248, 255)'),
                                showarrow=False))
        # labeling the remaining buckets on the top (legend row only)
        if yd == y_data[-1]:
            annotations.append(dict(xref='x', yref='paper',
                                    x=space + (xd[i]/2), y=1.1,
                                    text=top_labels[i],
                                    font=dict(family='Arial', size=13,
                                              color='rgb(67, 67, 67)'),
                                    showarrow=False))
        space += xd[i]

layout['annotations'] = annotations
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='income-percentage')
Out[174]:
In [176]:
#sizing legend bar by the largest category
# NOTE(review): the $40-50k bracket (inc_50k_sum) is hardcoded as "the largest
# category" - confirm that it still is when the data changes.
split_list = [1/6.] * 6
legend_total = sum(inc_50k_sum)
b_list = [int(round(share * legend_total, 0)) for share in split_list]
In [177]:
#### ///////// Bar Graph for Aggregate Populations ///////// ####
# Horizontal stacked bars: one row per income bracket, one segment per tax
# bucket (population counts), plus a final evenly split row (b_list) that acts
# as the legend.
top_labels = ['<0.5%','0.5-1.0%', '1.0-1.50%', '1.5-2.0%', '2.0-3.0%', '>3%']
colors = ['#77AF9C','#4ea1d3','#9055A2','#CE6D39', '#FFBC42','#D81159']
x_data = [inc_less_25k_sum, inc_40k_sum, inc_50k_sum, inc_60k_sum, inc_70k_sum, inc_80k_sum, inc_90k_sum, inc_100k_sum,
          inc_125k_sum, inc_more_125k_sum, b_list]
y_data = ['Less than $25k','$25-40k','$40-50k','$50-60k','$60-70k','$70-80k','$80-90k','$90-100k','$100-125k',
          'More than $125k','Carbon Tax as % of Income']

# One single-bar trace per (segment, row) pair; barmode='stack' places the
# segments of a row end to end.
traces = []
for i in range(0, len(x_data[0])):
    for xd, yd in zip(x_data, y_data):
        traces.append(go.Bar(
            # x and y are data arrays, so the single value is wrapped in a list.
            x=[xd[i]],
            y=[yd],
            orientation='h',
            marker=dict(
                color=colors[i],
                line=dict(
                    color='rgb(248, 248, 249)',
                    width=1))))

layout = go.Layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(
        l=5,
        r=5,
        t=70,
        b=70
    ),
    showlegend=False,)

annotations = []
for yd, xd in zip(y_data, x_data):
    # labeling the y-axis
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))
    # labeling the first population count of each bar (x_axis), in thousands
    # NOTE(review): `/ 1000` floors under Python 2 integers but yields
    # fractional labels under Python 3 - confirm the intended label format.
    annotations.append(dict(xref='x', yref='y',
                            x=xd[0] / 2, y=yd,
                            text=str(xd[0]/1000) + 'k',
                            font=dict(family='Arial', size=10,
                                      color='rgb(248, 248, 255)'),
                            showarrow=False))
    # labeling the first bucket on the top (legend row only)
    if yd == y_data[-1]:
        annotations.append(dict(xref='x', yref='paper',
                                x=xd[0] / 2, y=1.1,
                                text=top_labels[0],
                                font=dict(family='Arial', size=13,
                                          color='rgb(67, 67, 67)'),
                                showarrow=False))
    # `space` is the running width of the segments already labelled in this row
    space = xd[0]
    for i in range(1, len(xd)):
        # labeling the rest of the population counts for each bar (x_axis)
        annotations.append(dict(xref='x', yref='y',
                                x=space + (xd[i]/2), y=yd,
                                text=str(xd[i]/1000) + 'k',
                                font=dict(family='Arial', size=10,
                                          color='rgb(248, 248, 255)'),
                                showarrow=False))
        # labeling the remaining buckets on the top (legend row only)
        if yd == y_data[-1]:
            annotations.append(dict(xref='x', yref='paper',
                                    x=space + (xd[i]/2), y=1.1,
                                    text=top_labels[i],
                                    font=dict(family='Arial', size=13,
                                              color='rgb(67, 67, 67)'),
                                    showarrow=False))
        space += xd[i]

layout['annotations'] = annotations
fig = go.Figure(data=traces, layout=layout)
# Own filename: 'income-percentage' is already used by the percentage chart,
# and reusing it would overwrite that chart in the Plotly account.
py.iplot(fig, filename='income-population')
Out[177]:
In [188]:
import plotly.plotly as py
from plotly.tools import FigureFactory as FF
In [ ]:
# Carbon tons per household with missing values removed (input of the first distribution plot).
df_carb_ton = df_carbon.loc[:, 'z_carb_ton_ph'].dropna()
In [165]:
# Carbon Ton / Household PDF
# Histogram + density curve of carbon tons per household (no rug plot).
fig = FF.create_distplot([df_carb_ton], [''], show_rug=False, bin_size=.05)
fig['layout'].update(title='Distribution Plot - Carbon Tons Per Household',
                     width= 500, height = 300,
                     # explicit y ticks every 0.02, starting at 0
                     yaxis=dict(autotick=False,
                                ticks='outside',
                                tick0=0,
                                dtick=0.02,
                                ticklen=.5,
                                tickwidth=1,
                                tickcolor='#000',
                                ))
py.iplot(fig,filename='distplot-carbon-tax',validate=False)
Out[165]:
In [166]:
# CTax/Capita PDF
# Only per-capita values below 10000 are plotted.
# .loc replaces the removed .ix indexer (same result for a boolean mask).
df_carb_tax = df_carbon['carbon_tax_per_capita'].loc[df_carbon['carbon_tax_per_capita'] <10000].dropna()
fig = FF.create_distplot([df_carb_tax], [''],show_rug=False,bin_size=100)
fig['layout'].update(title='Distribution Plot - Carbon Tax Per Capita',
                     width= 600, height = 400)
# Own filename: 'distplot-carbon-tax' is already used by the carbon-tons
# distribution plot, and reusing it would overwrite that chart.
py.iplot(fig,filename='distplot-carbon-tax-per-capita',validate=False)
Out[166]:
In [167]:
#CTax/Income
# Only values below 15 (% of median income) are plotted.
# .loc replaces the removed .ix indexer (same result for a boolean mask).
df_tax_income = df_carbon['tax_per_income'].loc[df_carbon['tax_per_income'] <15].dropna()
fig = FF.create_distplot([df_tax_income], [''],show_rug=False,bin_size=.05)
fig['layout'].update(title='Distribution Plot - Carbon Tax as % of Median Income',
                     width= 600, height = 400)
# Own filename: 'distplot-carbon-tax' is already used by the carbon-tons
# distribution plot, and reusing it would overwrite that chart.
py.iplot(fig,filename='distplot-carbon-tax-pct-income',validate=False)
Out[167]:
In [ ]:
from sklearn.ensemble import RandomForestClassifier
In [189]:
# Create categories from CTax/Income
def categorize(x):
    """Map a tax-as-%-of-income value to a class label from 1 to 6.

    Classes 1-5 cover (0, .5], (.5, 1], (1, 1.5], (1.5, 2] and (2, 3].
    Everything else returns 6: values above 3, and also values that are not
    greater than 0, including NaN.
    NOTE(review): NaN and non-positive inputs sharing class 6 with ">3%" may
    not be intended - confirm.
    """
    if not x > 0:
        return 6
    upper_edges = (.5, 1.0, 1.5, 2.0, 3.0)
    for label, upper in enumerate(upper_edges, 1):
        if x <= upper:
            return label
    return 6
# Class label (1-6) for every zip code, derived from its tax-as-%-of-income value.
df_carbon['ctax_class'] = df_carbon['tax_per_income'].map(categorize)
In [190]:
# Summary statistics of df_carbon as it stands at this point in the notebook.
df_carbon.describe()
Out[190]:
In [192]:
#define new df for Random Forest Classifier
rf_feature_columns = ['z_pop', 'z_land', 'z_comm_miles', 'z_comm_miles_ph', 'z_house_unit',
                      'z_carb_ton_ph', 'z_carbon_tons', 'total_carbon_tax', 'z_pov']
df = df_carbon[rf_feature_columns]
labels = df_carbon['ctax_class']
#Fill NaNs in each column with the first entry of that column's value_counts(),
#i.e. its most frequent value (not the mean)
df = df.apply(lambda column: column.fillna(column.value_counts().index[0]))
In [193]:
def feature_importance(X, y):
    """Rank the columns of X by random-forest feature importance.

    Fits a RandomForestClassifier with 25 trees on (X, y) and returns a
    DataFrame with columns 'Features' and 'Feature_Importance', sorted from the
    most to the least important feature. No random seed is set, so the
    importances can differ between runs.
    """
    forest = RandomForestClassifier(n_estimators=25).fit(X, y)
    ranking = pd.DataFrame({
        'Feature_Importance': forest.feature_importances_.tolist(),
        'Features': X.columns.tolist(),
    })
    ordered_columns = ['Features', 'Feature_Importance']
    return ranking[ordered_columns].sort_values('Feature_Importance', ascending=False)

#Run function and create feature importance dataframe
df_features = feature_importance(df, labels)
In [194]:
#Create easier to read column names
# NOTE(review): this list is positional - it has to be in the same order as the
# rows of df_features (descending importance) for the labels to be correct.
feature_importance_list = [
    'Poverty Rate',
    'CTon/Household',
    'Commuter Miles/House',
    'Population',
    '# of House Units',
    'Commuter Miles',
    'Land Area',
    'Total Carbon Tax',
    'Carbon Tons',
]
In [198]:
import plotly.plotly as py
import plotly.graph_objs as go
# Readable label for each model column. Labels are looked up per row of
# df_features, so every bar is labelled with its own feature whatever order the
# (unseeded) random forest ranks them in; a fixed positional label list is only
# right for one particular ranking.
# NOTE(review): the column -> label pairing is inferred from the names - confirm.
feature_labels = {
    'z_pov': 'Poverty Rate',
    'z_carb_ton_ph': 'CTon/Household',
    'z_comm_miles_ph': 'Commuter Miles/House',
    'z_pop': 'Population',
    'z_house_unit': '# of House Units',
    'z_comm_miles': 'Commuter Miles',
    'z_land': 'Land Area',
    'total_carbon_tax': 'Total Carbon Tax',
    'z_carbon_tons': 'Carbon Tons',
}
data = [go.Bar(
    # unknown columns fall back to their raw name
    x=[feature_labels.get(column, column) for column in df_features['Features']],
    y=df_features['Feature_Importance'].tolist()
)]
py.iplot(data, filename='basic-bar')
Out[198]:
In [ ]: