Carbon Tax Analysis



In [197]:

    
#Initial setup 
import pandas as pd
import numpy as np
from IPython.display import Image
import plotly.plotly as py
import plotly.graph_objs as go

plotly.tools.set_credentials_file(username='crazysauce', api_key='NrGsrsXG6dSp3Ek1QnAy')
# plotly.tools.set_credentials_file(username='adam_owens', api_key='2RhvW0YButjQW520nRWu')


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))



In [2]:

    
df_carbon = pd.read_csv('/Users/adamowens/datascience/zcarbon_v2.csv')
df_carbon.describe()









    Out[2]:






  
    
      
      z_lat_d
      z_long_d
      z_land
      z_water
      z_pop
      z_households
      z_comm
      z_med_inc
      z_house_unit
      z_comm_miles
      z_comm_miles_ph
      z_carb_ton_ph
      z_id
      z_pov
      z_per_comm
    
  
  
    
      count
      26170.000000
      26170.000000
      26170.000000
      26170.000000
      26170.000000
      26170.000000
      26170.000000
      24709.00000
      26170.000000
      2.617000e+04
      26031.000000
      26031.000000
      26170.000000
      25548.000000
      25758.000000
    
    
      mean
      38.325589
      -88.969161
      52.803052
      1.253387
      11308.166794
      4271.586855
      2826.670462
      55545.24740
      4773.933053
      4.341181e+07
      19028.372806
      7.902110
      46426.915705
      11.037940
      78.700978
    
    
      std
      4.924491
      14.044077
      83.886523
      4.402078
      14713.765187
      5468.304529
      3678.442110
      23311.76608
      5967.497131
      5.732325e+07
      7298.145215
      3.030776
      27645.920808
      10.359998
      13.517806
    
    
      min
      17.963613
      -158.185151
      0.002000
      0.000000
      0.000000
      0.000000
      0.000000
      3479.00000
      0.000000
      0.000000e+00
      0.000000
      0.000000
      601.000000
      0.000000
      0.000000
    
    
      25%
      35.067059
      -94.930508
      6.954000
      0.018000
      1119.250000
      413.000000
      279.250000
      40659.00000
      521.250000
      5.581571e+06
      13632.368576
      5.661255
      23047.750000
      4.200000
      75.500000
    
    
      50%
      39.320292
      -85.841082
      27.855500
      0.194000
      4348.000000
      1625.500000
      1086.500000
      50923.00000
      1931.500000
      1.932455e+07
      19653.436688
      8.161687
      45134.000000
      8.500000
      81.300000
    
    
      75%
      41.697514
      -78.917872
      66.745500
      0.891000
      17115.250000
      6629.750000
      4278.750000
      64732.00000
      7449.000000
      6.003442e+07
      24886.945377
      10.335060
      68742.500000
      14.800000
      85.800000
    
    
      max
      65.390183
      -65.295977
      3529.043000
      248.932000
      113916.000000
      43456.000000
      28479.000000
      236500.00000
      47617.000000
      5.225270e+08
      32496.470500
      13.495146
      99790.000000
      100.000000
      100.000000



In [23]:

    
#Key statistics
df_key = df_carbon[['z_carbon_tons','z_carb_ton_ph','carbon_tax_per_capita','tax_per_income']]
df_key = df_key.rename(columns={"z_carbon_tons": "C_Tons", "z_carb_ton_ph": "C_Tons_PH","carbon_tax_per_capita": "C_Tax_capita", "tax_per_income": "C_Tax_per_inc"})
df_key.describe()









    Out[23]:






  
    
      
      C_Tons
      C_Tons_PH
      C_Tax_capita
      C_Tax_per_inc
    
  
  
    
      count
      26031.000000
      26031.000000
      26031.000000
      24708.000000
    
    
      mean
      30729.244942
      7.902110
      544.136045
      1.156416
    
    
      std
      38626.082451
      3.030776
      504.534513
      1.020256
    
    
      min
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      4481.615082
      5.661255
      322.617067
      0.530283
    
    
      50%
      14803.891124
      8.161687
      491.646494
      0.941361
    
    
      75%
      43335.972976
      10.335060
      671.301651
      1.505808
    
    
      max
      349412.195861
      13.495146
      26182.854553
      42.747052

Map of Carbon Ton Per Household



In [3]:

    
#Data Prep
df_carbon['z_id'] = df_carbon['z_id'].apply(lambda x: str(x))
df_carbon['z_carbon'] = df_carbon['z_carb_ton_ph'].apply(lambda x: "%.3f" % x)

df_carbon['text'] = 'Zip Code '+df_carbon['z_id'] + ' <br> '+(df_carbon['z_carbon']).astype(str)+' Carbon Tons'



In [178]:

    
#### //////// Create Map //////// ####

colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
limits = [(0,8),(8,10),(10,12),(12,15)]
zip_codes = []
scale = 1

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_carbon.ix[(df_carbon['z_carb_ton_ph'] > lim[0]) & 
                          (df_carbon['z_carb_ton_ph'] < lim[1])]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['z_carb_ton_ph']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
        title = '2014 Carbon Tons Per Household',
        showlegend = True,
        width = 1100,
        height = 750,
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"),)

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='Carbon Tons Per Household-map-populations' )









    Out[178]:

Map of Carbon Ton Pollution



In [179]:

    
#### //////// Similar exercise as the previous map //////// ####

df_carbon['z_carbon_tons'] = df_carbon['z_house_unit'] * df_carbon['z_carb_ton_ph']
df_carbon['text2'] = 'Zip Code '+df_carbon['z_id'] + ' <br> '+(df_carbon['z_carbon_tons']).astype(str)+' Carbon Tons'



In [180]:

    
def perc_carbon(n):
    carbon = np.array(df_carbon['z_carbon_tons'].sort_values())
    percentile = np.nanpercentile(carbon, n)
    
    return percentile



In [181]:

    
limits = [(0,25),(25,50),(50,75),(75,100)]
colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
zip_codes = []
scale = 10000

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_carbon.ix[(df_carbon['z_carbon_tons'] > perc_carbon(lim[0])) & 
                           (df_carbon['z_carbon_tons'] < perc_carbon(lim[1]))]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            size = df_sub['z_carbon_tons']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
        title = '2014 Carbon Tons<br>(By Percentile)',
        showlegend = True,
        width=1800,
        height=1200,    
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"),)

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='carbon-tons-percentile.png' )









    Out[181]:

Map of Carbon Tax Per Capita



In [182]:

    
#### //////// Similar exercise as the previous map //////// ####

# Model assumes $140/Carbon ton per person (annually)
tax = 140
df_carbon['total_carbon_tax']= df_carbon['z_carbon_tons'] * tax
df_carbon['carbon_tax_per_capita']= df_carbon['total_carbon_tax'] / df_carbon['z_pop']



In [183]:

    
def perc_carbon_pop(n):
    carbon = np.array(df_carbon['carbon_tax_per_capita'].sort_values())
    percentile = round(np.nanpercentile(carbon, n), 2)
    
    return percentile



In [184]:

    
limits = [(perc_carbon_pop(0),perc_carbon_pop(90)),(perc_carbon_pop(90),perc_carbon_pop(95))
          ,(perc_carbon_pop(95),perc_carbon_pop(99)),(perc_carbon_pop(99),perc_carbon_pop(100))]
colors = ["lightgrey","black","orange","red"]
zip_codes = []
scale = 200

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_carbon.ix[(df_carbon['carbon_tax_per_capita'] > lim[0]) & 
                           (df_carbon['carbon_tax_per_capita'] < lim[1])]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            size = df_sub['carbon_tax_per_capita']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
        title = '2014 Carbon Tax Per Capita <br>(By Percentile)',
        showlegend = True,
        width=1800,
        height=1200,

        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)")
    ,)

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='carbon_tax_140' )









    Out[184]:

Map of Carbon Tax Per Household



In [ ]:

    
#### //////// Similar exercise as the previous map //////// ####

# Model assumes $140/Carbon ton per person (annually)
tax = 140
df_carbon['carbon_tax_per_house']= df_carbon['z_carb_ton_ph'] * tax



In [10]:

    
def perc_carbon_house(n):
    carbon = np.array(df_carbon['carbon_tax_per_house'].sort_values())
    percentile = round(np.nanpercentile(carbon, n), 2)
    
    return percentile



In [185]:

    
limits = [(perc_carbon_house(0),perc_carbon_house(25)),(perc_carbon_house(25),perc_carbon_house(50))
          ,(perc_carbon_house(50),perc_carbon_house(75)),(perc_carbon_house(75),perc_carbon_house(100))]
colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
zip_codes = []
scale = 200

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_carbon.ix[(df_carbon['carbon_tax_per_capita'] > lim[0]) & 
                           (df_carbon['carbon_tax_per_capita'] < lim[1])]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            size = df_sub['z_households']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
        title = '2014 Carbon Tax Per Household (Quartiles)<br>(Sized by Population)',
        showlegend = True,
        width=1800,
        height=1200,

        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='carbon_tax_140' )









    Out[185]:

Map of Carbon Tax as % of Income



In [ ]:

    
#### //////// Similar exercise as the previous map //////// ####

# Model assumes $140/Carbon ton per person (annually)
df_carbon['tax_per_income']= (df_carbon['carbon_tax_per_capita']  / df_carbon['z_med_inc'])*100



In [186]:

    
def perc_tax_income(n):
    carbon = np.array(df_carbon['tax_per_income'].sort_values())
    percentile = round(np.nanpercentile(carbon, n), 2)
    
    return percentile



In [187]:

    
limits = [(perc_tax_income(0),perc_tax_income(25)),(perc_tax_income(25),perc_tax_income(50))
          ,(perc_tax_income(50),perc_tax_income(75)),(perc_tax_income(75),perc_tax_income(100))]
colors = ['#77AF9C','#4ea1d3','#FFBC42','#D81159']
zip_codes = []
scale = 1000

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_carbon.ix[(df_carbon['tax_per_income'] > lim[0]) & 
                           (df_carbon['tax_per_income'] < lim[1])]
    zip_code = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['z_long_d'],
        lat = df_sub['z_lat_d'],
        text = df_sub['text2'],
        marker = dict(
            size = df_sub['z_pop']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    zip_codes.append(zip_code)

layout = dict(
        title = '2014 Carbon Tax as Percentage of Income (Quartiles)<br>(Sized by Population)',
        showlegend = True,
        width=1800,
        height = 1200,
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

fig = dict( data=zip_codes, layout=layout )
py.iplot( fig, validate=False, filename='carbon_tax_140' )









    



---------------------------------------------------------------------------
PlotlyError                               Traceback (most recent call last)
<ipython-input-187-e9413ada0de5> in <module>()
     42 
     43 fig = dict( data=zip_codes, layout=layout )
---> 44 py.iplot( fig, validate=False, filename='carbon_tax_140' )

/Users/adamowens/anaconda/lib/python2.7/site-packages/plotly/plotly/plotly.pyc in iplot(figure_or_data, **plot_options)
    149     if 'auto_open' not in plot_options:
    150         plot_options['auto_open'] = False
--> 151     url = plot(figure_or_data, **plot_options)
    152 
    153     if isinstance(figure_or_data, dict):

/Users/adamowens/anaconda/lib/python2.7/site-packages/plotly/plotly/plotly.pyc in plot(figure_or_data, validate, **plot_options)
    239 
    240     plot_options = _plot_option_logic(plot_options)
--> 241     res = _send_to_plotly(figure, **plot_options)
    242     if res['error'] == '':
    243         if plot_options['auto_open']:

/Users/adamowens/anaconda/lib/python2.7/site-packages/plotly/plotly/plotly.pyc in _send_to_plotly(figure, **plot_options)
   1427 
   1428     if 'error' in r and r['error'] != '':
-> 1429         raise exceptions.PlotlyError(r['error'])
   1430 
   1431     # Check if the url needs a secret key

PlotlyError: Hey there! You've hit one of our API request limits. 

To get unlimited API calls(10,000/day), please upgrade to a paid plan. 

UPGRADE HERE: https://goo.gl/i7glmM 

Thanks for using Plotly! Happy Plotting!

Tax on Percentage of Population by Income Levels



In [ ]:

    
import plotly.plotly as py
import plotly.graph_objs as go



In [168]:

    
#create new variable measuring carbon tax as % of income
df_carbon['tax_per_income']= (df_carbon['carbon_tax_per_capita']  / df_carbon['z_med_inc'])*100

#for percentile analysis, not used in final presentation
def perc_tax_income_feature(feature):
    perc_list = []
    carbon = np.array(feature.sort_values())
    percentiles = [25, 50, 75, 100]
    
    for i in percentiles:
        perc = round(np.nanpercentile(carbon, i), 3)
        perc_list.append(perc)
    
    return perc_list

tax_income_perc = perc_tax_income_feature(df_carbon['tax_per_income'])

c_tax_perc_list = [.5, 1, 1.5, 2, 3, 42.747]



In [169]:

    
#function to return buckets of population levels or percentiles

def ctax_income_sum(df, tax_list):
    perc_sum = [0]
    perc_inc = tax_list
    
    for i in perc_inc:
        pop_sum = sum(df['z_pop'].ix[(df['tax_per_income'] < i)])
        perc_sum.append(pop_sum)
    
    buckets = [y - x for x,y in zip(perc_sum,perc_sum[1:])]

    return buckets

def ctax_income_perc(df, tax_list):
    perc_sum = [0]
    perc_inc = tax_list
    
    for i in perc_inc:
        pop_sum = sum(df['z_pop'].ix[(df['tax_per_income'] < i)])
        perc_sum.append(pop_sum)
    
    buckets = [y - x for x,y in zip(perc_sum,perc_sum[1:])]
    buckets2 = [float(i) for i in buckets]
    
    perc_buckets = [round(x*100/sum(buckets2),2) for x in buckets2]
    return perc_buckets



In [170]:

    
income_less_25k = df_carbon.ix[df_carbon['z_med_inc'] < 25000]
income_40k = df_carbon.ix[(df_carbon['z_med_inc'] > 25000) & (df_carbon['z_med_inc'] < 40000)]
income_50k = df_carbon.ix[(df_carbon['z_med_inc'] > 40000) & (df_carbon['z_med_inc'] < 50000)]
income_60k = df_carbon.ix[(df_carbon['z_med_inc'] > 50000) & (df_carbon['z_med_inc'] < 60000)]
income_70k = df_carbon.ix[(df_carbon['z_med_inc'] > 60000) & (df_carbon['z_med_inc'] < 70000)]
income_80k = df_carbon.ix[(df_carbon['z_med_inc'] > 70000) & (df_carbon['z_med_inc'] < 80000)]
income_90k = df_carbon.ix[(df_carbon['z_med_inc'] > 80000) & (df_carbon['z_med_inc'] < 90000)]
income_100k = df_carbon.ix[(df_carbon['z_med_inc'] > 90000) & (df_carbon['z_med_inc'] < 100000)]
income_125k = df_carbon.ix[(df_carbon['z_med_inc'] > 100000) & (df_carbon['z_med_inc'] < 125000)]
income_more_125k = df_carbon.ix[df_carbon['z_med_inc'] > 125000]



In [172]:

    
#buckets for aggregrate population levels
inc_less_25k_sum = ctax_income_sum(income_less_25k, c_tax_perc_list)
inc_40k_sum = ctax_income_sum(income_40k, c_tax_perc_list)
inc_50k_sum = ctax_income_sum(income_50k, c_tax_perc_list)
inc_60k_sum = ctax_income_sum(income_60k, c_tax_perc_list)
inc_70k_sum = ctax_income_sum(income_70k, c_tax_perc_list)
inc_80k_sum = ctax_income_sum(income_80k, c_tax_perc_list)
inc_90k_sum = ctax_income_sum(income_90k, c_tax_perc_list)
inc_100k_sum = ctax_income_sum(income_100k, c_tax_perc_list)
inc_125k_sum = ctax_income_sum(income_125k, c_tax_perc_list)
inc_more_125k_sum = ctax_income_sum(income_more_125k, c_tax_perc_list)

#buckets for aggregrate percentages
inc_less_25k_perc = ctax_income_perc(income_less_25k, c_tax_perc_list)
inc_40k_perc = ctax_income_perc(income_40k, c_tax_perc_list)
inc_50k_perc = ctax_income_perc(income_50k, c_tax_perc_list)
inc_60k_perc = ctax_income_perc(income_60k, c_tax_perc_list)
inc_70k_perc = ctax_income_perc(income_70k, c_tax_perc_list)
inc_80k_perc = ctax_income_perc(income_80k, c_tax_perc_list)
inc_90k_perc = ctax_income_perc(income_90k, c_tax_perc_list)
inc_100k_perc = ctax_income_perc(income_100k, c_tax_perc_list)
inc_125k_perc = ctax_income_perc(income_125k, c_tax_perc_list)
inc_more_125k_perc = ctax_income_perc(income_more_125k, c_tax_perc_list)



In [173]:

    
#Split levels for legend bar on top
split_list = [1/6.,1/6.,1/6.,1/6.,1/6.,1/6.]
a_list = [round(x*100,2) for x in split_list]



In [174]:

    
#### /////////  Bar Graph for Percentages  ///////// ####

top_labels = ['<0.5%','0.5-1.0%', '1.0-1.50%', '1.5-2.0%', '2.0-3.0%', '>3%']

colors = ['#77AF9C','#4ea1d3','#9055A2','#CE6D39', '#FFBC42','#D81159']

x_data = [inc_less_25k_perc, inc_40k_perc, inc_50k_perc, inc_60k_perc, inc_70k_perc, inc_80k_perc, inc_90k_perc, 
          inc_100k_perc, inc_125k_perc, inc_more_125k_perc, a_list]

y_data = ['Less than $25k','$25-40k','$40-50k','$50-60k','$60-70k','$70-80k','$80-90k','$90-100k','$100-125k',
          'More than $125k','Carbon Tax as % of Income']

traces = []

for i in range(0, len(x_data[0])):
    for xd, yd in zip(x_data, y_data):
        traces.append(go.Bar(
            x=xd[i],
            y=yd,
            orientation='h',
            marker=dict(
                color=colors[i],
                line=dict(
                        color='rgb(248, 248, 249)',
                        width=1))))

layout = go.Layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(
        l=5,
        r=5,
        t=70,
        b=70
    ),
    showlegend=False,)

annotations = []

for yd, xd in zip(y_data, x_data):
    # labeling the y-axis
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))
    # labeling the first percentage of each bar (x_axis)
    annotations.append(dict(xref='x', yref='y',
                            x=xd[0] / 2, y=yd,
                            text=str(xd[0]) + '%',
                            font=dict(family='Arial', size=10,
                                      color='rgb(248, 248, 255)'),
                            showarrow=False))
    # labeling the first Likert scale (on the top)
    if yd == y_data[-1]:
        annotations.append(dict(xref='x', yref='paper',
                                x=xd[0] / 2, y=1.1,
                                text=top_labels[0],
                                font=dict(family='Arial', size=13,
                                          color='rgb(67, 67, 67)'),
                                showarrow=False))
    space = xd[0]
    for i in range(1, len(xd)):
            # labeling the rest of percentages for each bar (x_axis)
            annotations.append(dict(xref='x', yref='y',
                                    x=space + (xd[i]/2), y=yd, 
                                    text=str(xd[i]) + '%',
                                    font=dict(family='Arial', size=10,
                                              color='rgb(248, 248, 255)'),
                                    showarrow=False))
            # labeling the Likert scale
            if yd == y_data[-1]:
                annotations.append(dict(xref='x', yref='paper',
                                        x=space + (xd[i]/2), y=1.1,
                                        text=top_labels[i],
                                        font=dict(family='Arial', size=13,
                                                  color='rgb(67, 67, 67)'),
                                        showarrow=False))
            space += xd[i]

layout['annotations'] = annotations

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='income-percentage')









    Out[174]:



In [176]:

    
#sizing legend bar by the largest category 
split_list = [1/6.,1/6.,1/6.,1/6.,1/6.,1/6.]
b_list = [int(round(x*sum(inc_50k_sum),0)) for x in split_list]



In [177]:

    
#### /////////  Bar Graph for Aggregrate Populations  ///////// ####

top_labels = ['<0.5%','0.5-1.0%', '1.0-1.50%', '1.5-2.0%', '2.0-3.0%', '>3%']

colors = ['#77AF9C','#4ea1d3','#9055A2','#CE6D39', '#FFBC42','#D81159']

x_data = [inc_less_25k_sum, inc_40k_sum, inc_50k_sum, inc_60k_sum, inc_70k_sum, inc_80k_sum, inc_90k_sum, inc_100k_sum, 
          inc_125k_sum, inc_more_125k_sum, b_list]

y_data = ['Less than $25k','$25-40k','$40-50k','$50-60k','$60-70k','$70-80k','$80-90k','$90-100k','$100-125k',
          'More than $125k','Carbon Tax as % of Income']

traces = []

for i in range(0, len(x_data[0])):
    for xd, yd in zip(x_data, y_data):
        traces.append(go.Bar(
            x=xd[i],
            y=yd,
            orientation='h',
            marker=dict(
                color=colors[i],
                line=dict(
                        color='rgb(248, 248, 249)',
                        width=1))))

layout = go.Layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(
        l=5,
        r=5,
        t=70,
        b=70
    ),
    showlegend=False,)

annotations = []

for yd, xd in zip(y_data, x_data):
    # labeling the y-axis
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))
    # labeling the first percentage of each bar (x_axis)
    annotations.append(dict(xref='x', yref='y',
                            x=xd[0] / 2, y=yd,
                            text=str(xd[0]/1000) + 'k',
                            font=dict(family='Arial', size=10,
                                      color='rgb(248, 248, 255)'),
                            showarrow=False))
    # labeling the first Likert scale (on the top)
    if yd == y_data[-1]:
        annotations.append(dict(xref='x', yref='paper',
                                x=xd[0] / 2, y=1.1,
                                text=top_labels[0],
                                font=dict(family='Arial', size=13,
                                          color='rgb(67, 67, 67)'),
                                showarrow=False))
    space = xd[0]
    for i in range(1, len(xd)):
            # labeling the rest of percentages for each bar (x_axis)
            annotations.append(dict(xref='x', yref='y',
                                    x=space + (xd[i]/2), y=yd, 
                                    text=str(xd[i]/1000) + 'k',
                                    font=dict(family='Arial', size=10,
                                              color='rgb(248, 248, 255)'),
                                    showarrow=False))
            # labeling the Likert scale
            if yd == y_data[-1]:
                annotations.append(dict(xref='x', yref='paper',
                                        x=space + (xd[i]/2), y=1.1,
                                        text=top_labels[i],
                                        font=dict(family='Arial', size=13,
                                                  color='rgb(67, 67, 67)'),
                                        showarrow=False))
            space += xd[i]

layout['annotations'] = annotations

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='income-percentage')









    Out[177]:

Distribution Plots



In [188]:

    
import plotly.plotly as py
from plotly.tools import FigureFactory as FF



In [ ]:

    
df_carb_ton = df_carbon['z_carb_ton_ph'].dropna()



In [165]:

    
# Caron Ton / Household PDF

fig = FF.create_distplot([df_carb_ton], [''],show_rug=False,bin_size=.05)
fig['layout'].update(title='Distibution Plot - Carbon Tons Per Household', 
                     width= 500, height = 300,
                        yaxis=dict(autotick=False,
                                   ticks='outside',
                                   tick0=0,
                                   dtick=0.02,
                                   ticklen=.5,
                                   tickwidth=1,
                                   tickcolor='#000',
                                  ))

py.iplot(fig,filename='distplot-carbon-tax',validate=False)









    Out[165]:



In [166]:

    
# CTax/Capita PDF

df_carb_tax =  df_carbon['carbon_tax_per_capita'].ix[df_carbon['carbon_tax_per_capita'] <10000].dropna()

fig = FF.create_distplot([df_carb_tax], [''],show_rug=False,bin_size=100)
fig['layout'].update(title='Distribution Plot - Carbon Tax Per Capita', 
                     width= 600, height = 400)

py.iplot(fig,filename='distplot-carbon-tax',validate=False)









    Out[166]:



In [167]:

    
#CTax/Income

df_tax_income = df_carbon['tax_per_income'].ix[df_carbon['tax_per_income'] <15].dropna()

fig = FF.create_distplot([df_tax_income], [''],show_rug=False,bin_size=.05)
fig['layout'].update(title='Distribution Plot - Carbon Tax as % of Median Income', 
                     width= 600, height = 400)

py.iplot(fig,filename='distplot-carbon-tax',validate=False)









    Out[167]:

Random Forest Feature Importance Analysis



In [ ]:

    
from sklearn.ensemble import RandomForestClassifier



In [189]:

    
# Create categories from CTax/Income

def categorize(x):
    if 0 < x <= .5:
        return int(1)
    if .5 < x <= 1.0:
        return int(2)
    if 1.0 < x <= 1.5:
        return int(3)
    if 1.5 < x <= 2.0:
        return int(4)
    if 2.0 < x <= 3.0:
        return int(5)
    
    else:
        return int(6)

df_carbon['ctax_class'] = df_carbon['tax_per_income'].apply(categorize)



In [190]:

    
df_carbon.describe()









    Out[190]:






  
    
      
      z_lat_d
      z_long_d
      z_land
      z_water
      z_pop
      z_households
      z_comm
      z_med_inc
      z_house_unit
      z_comm_miles
      z_comm_miles_ph
      z_carb_ton_ph
      z_pov
      z_per_comm
      z_carbon_tons
      total_carbon_tax
      carbon_tax_per_capita
      carbon_tax_per_house
      tax_per_income
      ctax_class
    
  
  
    
      count
      26170.000000
      26170.000000
      26170.000000
      26170.000000
      26170.000000
      26170.000000
      26170.000000
      24709.00000
      26170.000000
      2.617000e+04
      26031.000000
      26031.000000
      25548.000000
      25758.000000
      26031.000000
      26031.000000
      26031.000000
      26031.000000
      24708.000000
      26170.000000
    
    
      mean
      38.325589
      -88.969161
      52.803052
      1.253387
      11308.166794
      4271.586855
      2826.670462
      55545.24740
      4773.933053
      4.341181e+07
      19028.372806
      7.902110
      11.037940
      78.700978
      30729.244942
      4302094.291817
      544.136045
      1106.295368
      1.156416
      2.840352
    
    
      std
      4.924491
      14.044077
      83.886523
      4.402078
      14713.765187
      5468.304529
      3678.442110
      23311.76608
      5967.497131
      5.732325e+07
      7298.145215
      3.030776
      10.359998
      13.517806
      38626.082451
      5407651.543104
      504.534513
      424.308706
      1.020256
      1.544071
    
    
      min
      17.963613
      -158.185151
      0.002000
      0.000000
      0.000000
      0.000000
      0.000000
      3479.00000
      0.000000
      0.000000e+00
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1.000000
    
    
      25%
      35.067059
      -94.930508
      6.954000
      0.018000
      1119.250000
      413.000000
      279.250000
      40659.00000
      521.250000
      5.581571e+06
      13632.368576
      5.661255
      4.200000
      75.500000
      4481.615082
      627426.111442
      322.617067
      792.575717
      0.530283
      2.000000
    
    
      50%
      39.320292
      -85.841082
      27.855500
      0.194000
      4348.000000
      1625.500000
      1086.500000
      50923.00000
      1931.500000
      1.932455e+07
      19653.436688
      8.161687
      8.500000
      81.300000
      14803.891124
      2072544.757309
      491.646494
      1142.636115
      0.941361
      2.000000
    
    
      75%
      41.697514
      -78.917872
      66.745500
      0.891000
      17115.250000
      6629.750000
      4278.750000
      64732.00000
      7449.000000
      6.003442e+07
      24886.945377
      10.335060
      14.800000
      85.800000
      43335.972976
      6067036.216633
      671.301651
      1446.908397
      1.505808
      4.000000
    
    
      max
      65.390183
      -65.295977
      3529.043000
      248.932000
      113916.000000
      43456.000000
      28479.000000
      236500.00000
      47617.000000
      5.225270e+08
      32496.470500
      13.495146
      100.000000
      100.000000
      349412.195861
      48917707.420608
      26182.854553
      1889.320498
      42.747052
      6.000000



In [192]:

    
#define new df for Random Forest Classifier
df = df_carbon[['z_pop', 'z_land','z_comm_miles','z_comm_miles_ph','z_house_unit','z_carb_ton_ph','z_carbon_tons','total_carbon_tax','z_pov']]

labels = df_carbon['ctax_class']

#Fill NaNs with mean
df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))



In [193]:

    
def feature_importance(X, y):
    y_train = y
    model = RandomForestClassifier(n_estimators=25)
    clf = model.fit(X, y_train)

    feature_importance = clf.feature_importances_.tolist()
    features = X.columns.tolist()
    
    df_features = pd.DataFrame( {'Feature_Importance':feature_importance,'Features':features})

    df_features = df_features[['Features','Feature_Importance']].sort_values('Feature_Importance', ascending=False)
    
    return df_features

#Run function and create feature importance dataframe
df_features = feature_importance(df, labels)



In [194]:

    
#Create easier to read column names 
feature_importance_list = ['Poverty Rate','CTon/Household','Commuter Miles/House','Population','# of House Units',
                           'Commuter Miles','Land Area','Total Carbon Tax','Carbon Tons']



In [198]:

    
import plotly.plotly as py
import plotly.graph_objs as go

data = [go.Bar(
            x= feature_importance_list,
            y= df_features['Feature_Importance'].tolist()
    )]

py.iplot(data, filename='basic-bar')









    Out[198]:



In [ ]:

	z_lat_d	z_long_d	z_land	z_water	z_pop	z_households	z_comm	z_med_inc	z_house_unit	z_comm_miles	z_comm_miles_ph	z_carb_ton_ph	z_id	z_pov	z_per_comm
count	26170.000000	26170.000000	26170.000000	26170.000000	26170.000000	26170.000000	26170.000000	24709.00000	26170.000000	2.617000e+04	26031.000000	26031.000000	26170.000000	25548.000000	25758.000000
mean	38.325589	-88.969161	52.803052	1.253387	11308.166794	4271.586855	2826.670462	55545.24740	4773.933053	4.341181e+07	19028.372806	7.902110	46426.915705	11.037940	78.700978
std	4.924491	14.044077	83.886523	4.402078	14713.765187	5468.304529	3678.442110	23311.76608	5967.497131	5.732325e+07	7298.145215	3.030776	27645.920808	10.359998	13.517806
min	17.963613	-158.185151	0.002000	0.000000	0.000000	0.000000	0.000000	3479.00000	0.000000	0.000000e+00	0.000000	0.000000	601.000000	0.000000	0.000000
25%	35.067059	-94.930508	6.954000	0.018000	1119.250000	413.000000	279.250000	40659.00000	521.250000	5.581571e+06	13632.368576	5.661255	23047.750000	4.200000	75.500000
50%	39.320292	-85.841082	27.855500	0.194000	4348.000000	1625.500000	1086.500000	50923.00000	1931.500000	1.932455e+07	19653.436688	8.161687	45134.000000	8.500000	81.300000
75%	41.697514	-78.917872	66.745500	0.891000	17115.250000	6629.750000	4278.750000	64732.00000	7449.000000	6.003442e+07	24886.945377	10.335060	68742.500000	14.800000	85.800000
max	65.390183	-65.295977	3529.043000	248.932000	113916.000000	43456.000000	28479.000000	236500.00000	47617.000000	5.225270e+08	32496.470500	13.495146	99790.000000	100.000000	100.000000