Carbon Tax Analysis


In [197]:
#Initial setup 
import pandas as pd
import numpy as np
from IPython.display import Image
import plotly.plotly as py
import plotly.graph_objs as go

plotly.tools.set_credentials_file(username='crazysauce', api_key='NrGsrsXG6dSp3Ek1QnAy')
# plotly.tools.set_credentials_file(username='adam_owens', api_key='2RhvW0YButjQW520nRWu')


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))



In [2]:
df_carbon = pd.read_csv('/Users/adamowens/datascience/zcarbon_v2.csv')
df_carbon.describe()


Out[2]:
z_lat_d z_long_d z_land z_water z_pop z_households z_comm z_med_inc z_house_unit z_comm_miles z_comm_miles_ph z_carb_ton_ph z_id z_pov z_per_comm
count 26170.000000 26170.000000 26170.000000 26170.000000 26170.000000 26170.000000 26170.000000 24709.00000 26170.000000 2.617000e+04 26031.000000 26031.000000 26170.000000 25548.000000 25758.000000
mean 38.325589 -88.969161 52.803052 1.253387 11308.166794 4271.586855 2826.670462 55545.24740 4773.933053 4.341181e+07 19028.372806 7.902110 46426.915705 11.037940 78.700978
std 4.924491 14.044077 83.886523 4.402078 14713.765187 5468.304529 3678.442110 23311.76608 5967.497131 5.732325e+07 7298.145215 3.030776 27645.920808 10.359998 13.517806
min 17.963613 -158.185151 0.002000 0.000000 0.000000 0.000000 0.000000 3479.00000 0.000000 0.000000e+00 0.000000 0.000000 601.000000 0.000000 0.000000
25% 35.067059 -94.930508 6.954000 0.018000 1119.250000 413.000000 279.250000 40659.00000 521.250000 5.581571e+06 13632.368576 5.661255 23047.750000 4.200000 75.500000
50% 39.320292 -85.841082 27.855500 0.194000 4348.000000 1625.500000 1086.500000 50923.00000 1931.500000 1.932455e+07 19653.436688 8.161687 45134.000000 8.500000 81.300000
75% 41.697514 -78.917872 66.745500 0.891000 17115.250000 6629.750000 4278.750000 64732.00000 7449.000000 6.003442e+07 24886.945377 10.335060 68742.500000 14.800000 85.800000
max 65.390183 -65.295977 3529.043000 248.932000 113916.000000 43456.000000 28479.000000 236500.00000 47617.000000 5.225270e+08 32496.470500 13.495146 99790.000000 100.000000 100.000000

In [23]:
#Key statistics
df_key = df_carbon[['z_carbon_tons','z_carb_ton_ph','carbon_tax_per_capita','tax_per_income']]
df_key = df_key.rename(columns={"z_carbon_tons": "C_Tons", "z_carb_ton_ph": "C_Tons_PH","carbon_tax_per_capita": "C_Tax_capita", "tax_per_income": "C_Tax_per_inc"})
df_key.describe()


Out[23]:
C_Tons C_Tons_PH C_Tax_capita C_Tax_per_inc
count 26031.000000 26031.000000 26031.000000 24708.000000
mean 30729.244942 7.902110 544.136045 1.156416
std 38626.082451 3.030776 504.534513 1.020256
min 0.000000 0.000000 0.000000 0.000000
25% 4481.615082 5.661255 322.617067 0.530283
50% 14803.891124 8.161687 491.646494 0.941361
75% 43335.972976 10.335060 671.301651 1.505808
max 349412.195861 13.495146 26182.854553 42.747052

Map of Carbon Ton Per Household


In [3]:
#Data Prep
df_carbon['z_id'] = df_carbon['z_id'].apply(lambda x: str(x))
df_carbon['z_carbon'] = df_carbon['z_carb_ton_ph'].apply(lambda x: "%.3f" % x)

df_carbon['text'] = 'Zip Code '+df_carbon['z_id'] + ' <br> '+(df_carbon['z_carbon']).astype(str)+' Carbon Tons'

In [178]:
#### //////// Create Map //////// ####

colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
limits = [(0,8),(8,10),(10,12),(12,15)]
zip_codes = []
scale = 1

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_carbon.ix[(df_carbon['z_carb_ton_ph'] > lim[0]) & 
                          (df_carbon['z_carb_ton_ph'] < lim[1])]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['z_carb_ton_ph']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
        title = '2014 Carbon Tons Per Household',
        showlegend = True,
        width = 1100,
        height = 750,
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"),)

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='Carbon Tons Per Household-map-populations' )


Out[178]:

Map of Carbon Ton Pollution


In [179]:
#### //////// Similar exercise as the previous map //////// ####

df_carbon['z_carbon_tons'] = df_carbon['z_house_unit'] * df_carbon['z_carb_ton_ph']
df_carbon['text2'] = 'Zip Code '+df_carbon['z_id'] + ' <br> '+(df_carbon['z_carbon_tons']).astype(str)+' Carbon Tons'

In [180]:
def perc_carbon(n):
    carbon = np.array(df_carbon['z_carbon_tons'].sort_values())
    percentile = np.nanpercentile(carbon, n)
    
    return percentile

In [181]:
limits = [(0,25),(25,50),(50,75),(75,100)]
colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
zip_codes = []
scale = 10000

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_carbon.ix[(df_carbon['z_carbon_tons'] > perc_carbon(lim[0])) & 
                           (df_carbon['z_carbon_tons'] < perc_carbon(lim[1]))]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            size = df_sub['z_carbon_tons']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
        title = '2014 Carbon Tons<br>(By Percentile)',
        showlegend = True,
        width=1800,
        height=1200,    
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"),)

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='carbon-tons-percentile.png' )


Out[181]:

Map of Carbon Tax Per Capita


In [182]:
#### //////// Similar exercise as the previous map //////// ####

# Model assumes $140/Carbon ton per person (annually)
tax = 140
df_carbon['total_carbon_tax']= df_carbon['z_carbon_tons'] * tax
df_carbon['carbon_tax_per_capita']= df_carbon['total_carbon_tax'] / df_carbon['z_pop']

In [183]:
def perc_carbon_pop(n):
    carbon = np.array(df_carbon['carbon_tax_per_capita'].sort_values())
    percentile = round(np.nanpercentile(carbon, n), 2)
    
    return percentile

In [184]:
limits = [(perc_carbon_pop(0),perc_carbon_pop(90)),(perc_carbon_pop(90),perc_carbon_pop(95))
          ,(perc_carbon_pop(95),perc_carbon_pop(99)),(perc_carbon_pop(99),perc_carbon_pop(100))]
colors = ["lightgrey","black","orange","red"]
zip_codes = []
scale = 200

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_carbon.ix[(df_carbon['carbon_tax_per_capita'] > lim[0]) & 
                           (df_carbon['carbon_tax_per_capita'] < lim[1])]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            size = df_sub['carbon_tax_per_capita']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
        title = '2014 Carbon Tax Per Capita <br>(By Percentile)',
        showlegend = True,
        width=1800,
        height=1200,

        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)")
    ,)

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='carbon_tax_140' )


Out[184]:

Map of Carbon Tax Per Household


In [ ]:
#### //////// Similar exercise as the previous map //////// ####

# Model assumes $140/Carbon ton per person (annually)
tax = 140
df_carbon['carbon_tax_per_house']= df_carbon['z_carb_ton_ph'] * tax

In [10]:
def perc_carbon_house(n):
    carbon = np.array(df_carbon['carbon_tax_per_house'].sort_values())
    percentile = round(np.nanpercentile(carbon, n), 2)
    
    return percentile

In [185]:
limits = [(perc_carbon_house(0),perc_carbon_house(25)),(perc_carbon_house(25),perc_carbon_house(50))
          ,(perc_carbon_house(50),perc_carbon_house(75)),(perc_carbon_house(75),perc_carbon_house(100))]
colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
zip_codes = []
scale = 200

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_carbon.ix[(df_carbon['carbon_tax_per_capita'] > lim[0]) & 
                           (df_carbon['carbon_tax_per_capita'] < lim[1])]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            size = df_sub['z_households']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
        title = '2014 Carbon Tax Per Household (Quartiles)<br>(Sized by Population)',
        showlegend = True,
        width=1800,
        height=1200,

        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='carbon_tax_140' )


Out[185]:

Map of Carbon Tax as % of Income


In [ ]:
#### //////// Similar exercise as the previous map //////// ####

# Model assumes $140/Carbon ton per person (annually)
df_carbon['tax_per_income']= (df_carbon['carbon_tax_per_capita']  / df_carbon['z_med_inc'])*100

In [186]:
def perc_tax_income(n):
    carbon = np.array(df_carbon['tax_per_income'].sort_values())
    percentile = round(np.nanpercentile(carbon, n), 2)
    
    return percentile

In [187]:
limits = [(perc_tax_income(0),perc_tax_income(25)),(perc_tax_income(25),perc_tax_income(50))
          ,(perc_tax_income(50),perc_tax_income(75)),(perc_tax_income(75),perc_tax_income(100))]
colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
zip_codes = []
scale = 1000

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_carbon.ix[(df_carbon['tax_per_income'] > lim[0]) & 
                           (df_carbon['tax_per_income'] < lim[1])]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            size = df_sub['z_pop']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
        title = '2014 Carbon Tax as Percentage of Income (Quartiles)<br>(Sized by Population)',
        showlegend = True,
        width=1800,
        height = 1200,
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='carbon_tax_140' )


---------------------------------------------------------------------------
PlotlyError                               Traceback (most recent call last)
<ipython-input-187-e9413ada0de5> in <module>()
     42 
     43 fig = dict( data=zip_codes, layout=layout )
---> 44 py.iplot( fig, validate=False, filename='carbon_tax_140' )

/Users/adamowens/anaconda/lib/python2.7/site-packages/plotly/plotly/plotly.pyc in iplot(figure_or_data, **plot_options)
    149     if 'auto_open' not in plot_options:
    150         plot_options['auto_open'] = False
--> 151     url = plot(figure_or_data, **plot_options)
    152 
    153     if isinstance(figure_or_data, dict):

/Users/adamowens/anaconda/lib/python2.7/site-packages/plotly/plotly/plotly.pyc in plot(figure_or_data, validate, **plot_options)
    239 
    240     plot_options = _plot_option_logic(plot_options)
--> 241     res = _send_to_plotly(figure, **plot_options)
    242     if res['error'] == '':
    243         if plot_options['auto_open']:

/Users/adamowens/anaconda/lib/python2.7/site-packages/plotly/plotly/plotly.pyc in _send_to_plotly(figure, **plot_options)
   1427 
   1428     if 'error' in r and r['error'] != '':
-> 1429         raise exceptions.PlotlyError(r['error'])
   1430 
   1431     # Check if the url needs a secret key

PlotlyError: Hey there! You've hit one of our API request limits. 

To get unlimited API calls(10,000/day), please upgrade to a paid plan. 

UPGRADE HERE: https://goo.gl/i7glmM 

Thanks for using Plotly! Happy Plotting!

Tax on Percentage of Population by Income Levels


In [ ]:
import plotly.plotly as py
import plotly.graph_objs as go

In [168]:
#create new variable measuring carbon tax as % of income
df_carbon['tax_per_income']= (df_carbon['carbon_tax_per_capita']  / df_carbon['z_med_inc'])*100

#for percentile analysis, not used in final presentation
def perc_tax_income_feature(feature):
    perc_list = []
    carbon = np.array(feature.sort_values())
    percentiles = [25, 50, 75, 100]
    
    for i in percentiles:
        perc = round(np.nanpercentile(carbon, i), 3)
        perc_list.append(perc)
    
    return perc_list

tax_income_perc = perc_tax_income_feature(df_carbon['tax_per_income'])

c_tax_perc_list = [.5, 1, 1.5, 2, 3, 42.747]

In [169]:
#function to return buckets of population levels or percentiles

def ctax_income_sum(df, tax_list):
    perc_sum = [0]
    perc_inc = tax_list
    
    for i in perc_inc:
        pop_sum = sum(df['z_pop'].ix[(df['tax_per_income'] < i)])
        perc_sum.append(pop_sum)
    
    buckets = [y - x for x,y in zip(perc_sum,perc_sum[1:])]

    return buckets

def ctax_income_perc(df, tax_list):
    perc_sum = [0]
    perc_inc = tax_list
    
    for i in perc_inc:
        pop_sum = sum(df['z_pop'].ix[(df['tax_per_income'] < i)])
        perc_sum.append(pop_sum)
    
    buckets = [y - x for x,y in zip(perc_sum,perc_sum[1:])]
    buckets2 = [float(i) for i in buckets]
    
    perc_buckets = [round(x*100/sum(buckets2),2) for x in buckets2]
    return perc_buckets

In [170]:
income_less_25k = df_carbon.ix[df_carbon['z_med_inc'] < 25000]
income_40k = df_carbon.ix[(df_carbon['z_med_inc'] > 25000) & (df_carbon['z_med_inc'] < 40000)]
income_50k = df_carbon.ix[(df_carbon['z_med_inc'] > 40000) & (df_carbon['z_med_inc'] < 50000)]
income_60k = df_carbon.ix[(df_carbon['z_med_inc'] > 50000) & (df_carbon['z_med_inc'] < 60000)]
income_70k = df_carbon.ix[(df_carbon['z_med_inc'] > 60000) & (df_carbon['z_med_inc'] < 70000)]
income_80k = df_carbon.ix[(df_carbon['z_med_inc'] > 70000) & (df_carbon['z_med_inc'] < 80000)]
income_90k = df_carbon.ix[(df_carbon['z_med_inc'] > 80000) & (df_carbon['z_med_inc'] < 90000)]
income_100k = df_carbon.ix[(df_carbon['z_med_inc'] > 90000) & (df_carbon['z_med_inc'] < 100000)]
income_125k = df_carbon.ix[(df_carbon['z_med_inc'] > 100000) & (df_carbon['z_med_inc'] < 125000)]
income_more_125k = df_carbon.ix[df_carbon['z_med_inc'] > 125000]

In [172]:
#buckets for aggregrate population levels
inc_less_25k_sum = ctax_income_sum(income_less_25k, c_tax_perc_list)
inc_40k_sum = ctax_income_sum(income_40k, c_tax_perc_list)
inc_50k_sum = ctax_income_sum(income_50k, c_tax_perc_list)
inc_60k_sum = ctax_income_sum(income_60k, c_tax_perc_list)
inc_70k_sum = ctax_income_sum(income_70k, c_tax_perc_list)
inc_80k_sum = ctax_income_sum(income_80k, c_tax_perc_list)
inc_90k_sum = ctax_income_sum(income_90k, c_tax_perc_list)
inc_100k_sum = ctax_income_sum(income_100k, c_tax_perc_list)
inc_125k_sum = ctax_income_sum(income_125k, c_tax_perc_list)
inc_more_125k_sum = ctax_income_sum(income_more_125k, c_tax_perc_list)

#buckets for aggregrate percentages
inc_less_25k_perc = ctax_income_perc(income_less_25k, c_tax_perc_list)
inc_40k_perc = ctax_income_perc(income_40k, c_tax_perc_list)
inc_50k_perc = ctax_income_perc(income_50k, c_tax_perc_list)
inc_60k_perc = ctax_income_perc(income_60k, c_tax_perc_list)
inc_70k_perc = ctax_income_perc(income_70k, c_tax_perc_list)
inc_80k_perc = ctax_income_perc(income_80k, c_tax_perc_list)
inc_90k_perc = ctax_income_perc(income_90k, c_tax_perc_list)
inc_100k_perc = ctax_income_perc(income_100k, c_tax_perc_list)
inc_125k_perc = ctax_income_perc(income_125k, c_tax_perc_list)
inc_more_125k_perc = ctax_income_perc(income_more_125k, c_tax_perc_list)

In [173]:
#Split levels for legend bar on top
split_list = [1/6.,1/6.,1/6.,1/6.,1/6.,1/6.]
a_list = [round(x*100,2) for x in split_list]

In [174]:
#### /////////  Bar Graph for Percentages  ///////// ####

top_labels = ['<0.5%','0.5-1.0%', '1.0-1.50%', '1.5-2.0%', '2.0-3.0%', '>3%']

colors = ['#77AF9C','#4ea1d3','#9055A2','#CE6D39', '#FFBC42','#D81159']

x_data = [inc_less_25k_perc, inc_40k_perc, inc_50k_perc, inc_60k_perc, inc_70k_perc, inc_80k_perc, inc_90k_perc, 
          inc_100k_perc, inc_125k_perc, inc_more_125k_perc, a_list]

y_data = ['Less than $25k','$25-40k','$40-50k','$50-60k','$60-70k','$70-80k','$80-90k','$90-100k','$100-125k',
          'More than $125k','Carbon Tax as % of Income']

traces = []

for i in range(0, len(x_data[0])):
    for xd, yd in zip(x_data, y_data):
        traces.append(go.Bar(
            x=xd[i],
            y=yd,
            orientation='h',
            marker=dict(
                color=colors[i],
                line=dict(
                        color='rgb(248, 248, 249)',
                        width=1))))

layout = go.Layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(
        l=5,
        r=5,
        t=70,
        b=70
    ),
    showlegend=False,)

annotations = []

for yd, xd in zip(y_data, x_data):
    # labeling the y-axis
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))
    # labeling the first percentage of each bar (x_axis)
    annotations.append(dict(xref='x', yref='y',
                            x=xd[0] / 2, y=yd,
                            text=str(xd[0]) + '%',
                            font=dict(family='Arial', size=10,
                                      color='rgb(248, 248, 255)'),
                            showarrow=False))
    # labeling the first Likert scale (on the top)
    if yd == y_data[-1]:
        annotations.append(dict(xref='x', yref='paper',
                                x=xd[0] / 2, y=1.1,
                                text=top_labels[0],
                                font=dict(family='Arial', size=13,
                                          color='rgb(67, 67, 67)'),
                                showarrow=False))
    space = xd[0]
    for i in range(1, len(xd)):
            # labeling the rest of percentages for each bar (x_axis)
            annotations.append(dict(xref='x', yref='y',
                                    x=space + (xd[i]/2), y=yd, 
                                    text=str(xd[i]) + '%',
                                    font=dict(family='Arial', size=10,
                                              color='rgb(248, 248, 255)'),
                                    showarrow=False))
            # labeling the Likert scale
            if yd == y_data[-1]:
                annotations.append(dict(xref='x', yref='paper',
                                        x=space + (xd[i]/2), y=1.1,
                                        text=top_labels[i],
                                        font=dict(family='Arial', size=13,
                                                  color='rgb(67, 67, 67)'),
                                        showarrow=False))
            space += xd[i]

layout['annotations'] = annotations

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='income-percentage')


Out[174]:

In [176]:
#sizing legend bar by the largest category 
split_list = [1/6.,1/6.,1/6.,1/6.,1/6.,1/6.]
b_list = [int(round(x*sum(inc_50k_sum),0)) for x in split_list]

In [177]:
#### /////////  Bar Graph for Aggregrate Populations  ///////// ####

top_labels = ['<0.5%','0.5-1.0%', '1.0-1.50%', '1.5-2.0%', '2.0-3.0%', '>3%']

colors = ['#77AF9C','#4ea1d3','#9055A2','#CE6D39', '#FFBC42','#D81159']

x_data = [inc_less_25k_sum, inc_40k_sum, inc_50k_sum, inc_60k_sum, inc_70k_sum, inc_80k_sum, inc_90k_sum, inc_100k_sum, 
          inc_125k_sum, inc_more_125k_sum, b_list]

y_data = ['Less than $25k','$25-40k','$40-50k','$50-60k','$60-70k','$70-80k','$80-90k','$90-100k','$100-125k',
          'More than $125k','Carbon Tax as % of Income']

traces = []

for i in range(0, len(x_data[0])):
    for xd, yd in zip(x_data, y_data):
        traces.append(go.Bar(
            x=xd[i],
            y=yd,
            orientation='h',
            marker=dict(
                color=colors[i],
                line=dict(
                        color='rgb(248, 248, 249)',
                        width=1))))

layout = go.Layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(
        l=5,
        r=5,
        t=70,
        b=70
    ),
    showlegend=False,)

annotations = []

for yd, xd in zip(y_data, x_data):
    # labeling the y-axis
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))
    # labeling the first percentage of each bar (x_axis)
    annotations.append(dict(xref='x', yref='y',
                            x=xd[0] / 2, y=yd,
                            text=str(xd[0]/1000) + 'k',
                            font=dict(family='Arial', size=10,
                                      color='rgb(248, 248, 255)'),
                            showarrow=False))
    # labeling the first Likert scale (on the top)
    if yd == y_data[-1]:
        annotations.append(dict(xref='x', yref='paper',
                                x=xd[0] / 2, y=1.1,
                                text=top_labels[0],
                                font=dict(family='Arial', size=13,
                                          color='rgb(67, 67, 67)'),
                                showarrow=False))
    space = xd[0]
    for i in range(1, len(xd)):
            # labeling the rest of percentages for each bar (x_axis)
            annotations.append(dict(xref='x', yref='y',
                                    x=space + (xd[i]/2), y=yd, 
                                    text=str(xd[i]/1000) + 'k',
                                    font=dict(family='Arial', size=10,
                                              color='rgb(248, 248, 255)'),
                                    showarrow=False))
            # labeling the Likert scale
            if yd == y_data[-1]:
                annotations.append(dict(xref='x', yref='paper',
                                        x=space + (xd[i]/2), y=1.1,
                                        text=top_labels[i],
                                        font=dict(family='Arial', size=13,
                                                  color='rgb(67, 67, 67)'),
                                        showarrow=False))
            space += xd[i]

layout['annotations'] = annotations

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='income-percentage')


Out[177]:

Distribution Plots


In [188]:
import plotly.plotly as py
from plotly.tools import FigureFactory as FF

In [ ]:
df_carb_ton = df_carbon['z_carb_ton_ph'].dropna()

In [165]:
# Caron Ton / Household PDF

fig = FF.create_distplot([df_carb_ton], [''],show_rug=False,bin_size=.05)
fig['layout'].update(title='Distibution Plot - Carbon Tons Per Household', 
                     width= 500, height = 300,
                        yaxis=dict(autotick=False,
                                   ticks='outside',
                                   tick0=0,
                                   dtick=0.02,
                                   ticklen=.5,
                                   tickwidth=1,
                                   tickcolor='#000',
                                  ))

py.iplot(fig,filename='distplot-carbon-tax',validate=False)


Out[165]:

In [166]:
# CTax/Capita PDF

df_carb_tax =  df_carbon['carbon_tax_per_capita'].ix[df_carbon['carbon_tax_per_capita'] <10000].dropna()

fig = FF.create_distplot([df_carb_tax], [''],show_rug=False,bin_size=100)
fig['layout'].update(title='Distribution Plot - Carbon Tax Per Capita', 
                     width= 600, height = 400)

py.iplot(fig,filename='distplot-carbon-tax',validate=False)


Out[166]:

In [167]:
#CTax/Income

df_tax_income = df_carbon['tax_per_income'].ix[df_carbon['tax_per_income'] <15].dropna()

fig = FF.create_distplot([df_tax_income], [''],show_rug=False,bin_size=.05)
fig['layout'].update(title='Distribution Plot - Carbon Tax as % of Median Income', 
                     width= 600, height = 400)

py.iplot(fig,filename='distplot-carbon-tax',validate=False)


Out[167]:

Random Forest Feature Importance Analysis


In [ ]:
from sklearn.ensemble import RandomForestClassifier

In [189]:
# Create categories from CTax/Income

def categorize(x):
    if 0 < x <= .5:
        return int(1)
    if .5 < x <= 1.0:
        return int(2)
    if 1.0 < x <= 1.5:
        return int(3)
    if 1.5 < x <= 2.0:
        return int(4)
    if 2.0 < x <= 3.0:
        return int(5)
    
    else:
        return int(6)

df_carbon['ctax_class'] = df_carbon['tax_per_income'].apply(categorize)

In [190]:
df_carbon.describe()


Out[190]:
z_lat_d z_long_d z_land z_water z_pop z_households z_comm z_med_inc z_house_unit z_comm_miles z_comm_miles_ph z_carb_ton_ph z_pov z_per_comm z_carbon_tons total_carbon_tax carbon_tax_per_capita carbon_tax_per_house tax_per_income ctax_class
count 26170.000000 26170.000000 26170.000000 26170.000000 26170.000000 26170.000000 26170.000000 24709.00000 26170.000000 2.617000e+04 26031.000000 26031.000000 25548.000000 25758.000000 26031.000000 26031.000000 26031.000000 26031.000000 24708.000000 26170.000000
mean 38.325589 -88.969161 52.803052 1.253387 11308.166794 4271.586855 2826.670462 55545.24740 4773.933053 4.341181e+07 19028.372806 7.902110 11.037940 78.700978 30729.244942 4302094.291817 544.136045 1106.295368 1.156416 2.840352
std 4.924491 14.044077 83.886523 4.402078 14713.765187 5468.304529 3678.442110 23311.76608 5967.497131 5.732325e+07 7298.145215 3.030776 10.359998 13.517806 38626.082451 5407651.543104 504.534513 424.308706 1.020256 1.544071
min 17.963613 -158.185151 0.002000 0.000000 0.000000 0.000000 0.000000 3479.00000 0.000000 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
25% 35.067059 -94.930508 6.954000 0.018000 1119.250000 413.000000 279.250000 40659.00000 521.250000 5.581571e+06 13632.368576 5.661255 4.200000 75.500000 4481.615082 627426.111442 322.617067 792.575717 0.530283 2.000000
50% 39.320292 -85.841082 27.855500 0.194000 4348.000000 1625.500000 1086.500000 50923.00000 1931.500000 1.932455e+07 19653.436688 8.161687 8.500000 81.300000 14803.891124 2072544.757309 491.646494 1142.636115 0.941361 2.000000
75% 41.697514 -78.917872 66.745500 0.891000 17115.250000 6629.750000 4278.750000 64732.00000 7449.000000 6.003442e+07 24886.945377 10.335060 14.800000 85.800000 43335.972976 6067036.216633 671.301651 1446.908397 1.505808 4.000000
max 65.390183 -65.295977 3529.043000 248.932000 113916.000000 43456.000000 28479.000000 236500.00000 47617.000000 5.225270e+08 32496.470500 13.495146 100.000000 100.000000 349412.195861 48917707.420608 26182.854553 1889.320498 42.747052 6.000000

In [192]:
#define new df for Random Forest Classifier
df = df_carbon[['z_pop', 'z_land','z_comm_miles','z_comm_miles_ph','z_house_unit','z_carb_ton_ph','z_carbon_tons','total_carbon_tax','z_pov']]

labels = df_carbon['ctax_class']

#Fill NaNs with mean
df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [193]:
def feature_importance(X, y):
    y_train = y
    model = RandomForestClassifier(n_estimators=25)
    clf = model.fit(X, y_train)

    feature_importance = clf.feature_importances_.tolist()
    features = X.columns.tolist()
    
    df_features = pd.DataFrame( {'Feature_Importance':feature_importance,'Features':features})

    df_features = df_features[['Features','Feature_Importance']].sort_values('Feature_Importance', ascending=False)
    
    return df_features

#Run function and create feature importance dataframe
df_features = feature_importance(df, labels)

In [194]:
#Create easier to read column names 
feature_importance_list = ['Poverty Rate','CTon/Household','Commuter Miles/House','Population','# of House Units',
                           'Commuter Miles','Land Area','Total Carbon Tax','Carbon Tons']

In [198]:
import plotly.plotly as py
import plotly.graph_objs as go

data = [go.Bar(
            x= feature_importance_list,
            y= df_features['Feature_Importance'].tolist()
    )]

py.iplot(data, filename='basic-bar')


Out[198]:

In [ ]: