Assignment 4

Using data from this FiveThirtyEight post, write code to calculate the correlation of the responses from the poll.
Respond to the story in your PR. Is this a good example of data journalism? Why or why not?



In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.formula.api as smf



In [54]:

    
iran_df=pd.read_csv('/home/sean/Documents/iran_polls.csv')



In [55]:

    
obama_df=pd.read_csv('/home/sean/Documents/obama_polls.csv')



In [56]:

    
iran_df









    Out[56]:






  
    
      
      Voter
      Approve
      Disapprove
      DK
      Approve.1
      Disapprove.1
      DK.1
      Approve.2
      Disapprove.2
      DK.2
    
  
  
    
      0
      Total
      0.58
      0.33
      0.09
      0.47
      0.43
      0.10
      0.47
      0.43
      0.10
    
    
      1
      Republican
      0.37
      0.56
      0.07
      0.34
      0.60
      0.06
      0.34
      0.60
      0.06
    
    
      2
      Democrat
      0.76
      0.15
      0.08
      0.60
      0.26
      0.14
      0.60
      0.26
      0.14
    
    
      3
      Independant
      0.60
      0.33
      0.07
      0.44
      0.49
      0.07
      0.44
      0.49
      0.07
    
    
      4
      Men
      0.55
      0.39
      0.06
      0.46
      0.48
      0.06
      0.46
      0.48
      0.06
    
    
      5
      Women
      0.61
      0.28
      0.11
      0.47
      0.39
      0.14
      0.47
      0.39
      0.14
    
    
      6
      Under 35
      0.66
      0.24
      0.11
      0.51
      0.37
      0.12
      0.51
      0.37
      0.12
    
    
      7
      35-54
      0.60
      0.33
      0.07
      0.46
      0.47
      0.07
      0.46
      0.47
      0.07
    
    
      8
      55 plus
      0.54
      0.37
      0.09
      0.45
      0.44
      0.11
      0.45
      0.44
      0.11



In [57]:

    
obama_df









    Out[57]:






  
    
      
      Voter
      Approve
      Disapprove
      DK
      Approve.1
      Disapprove.1
      DK.1
      Approve.2
      Disapprove.2
      DK.2
    
  
  
    
      0
      Total
      0.44
      0.49
      0.07
      0.44
      0.50
      0.06
      0.44
      0.50
      0.06
    
    
      1
      Republican
      0.06
      0.91
      0.03
      0.10
      0.86
      0.04
      0.10
      0.86
      0.04
    
    
      2
      Democrat
      0.84
      0.13
      0.03
      0.78
      0.17
      0.05
      0.78
      0.17
      0.05
    
    
      3
      Independant
      0.40
      0.51
      0.09
      0.37
      0.53
      0.10
      0.37
      0.53
      0.10
    
    
      4
      Men
      0.39
      0.55
      0.05
      0.41
      0.53
      0.05
      0.41
      0.53
      0.05
    
    
      5
      Women
      0.48
      0.44
      0.08
      0.47
      0.47
      0.06
      0.47
      0.47
      0.06
    
    
      6
      Under 35
      0.49
      0.41
      0.11
      0.56
      0.36
      0.08
      0.56
      0.36
      0.08
    
    
      7
      35-54
      0.48
      0.45
      0.06
      0.35
      0.60
      0.04
      0.35
      0.60
      0.04
    
    
      8
      55 plus
      0.39
      0.57
      0.04
      0.45
      0.50
      0.05
      0.45
      0.50
      0.05



In [58]:

    
combined=pd.merge(left=obama_df, right=iran_df, how='outer')



In [59]:

    
combined









    Out[59]:






  
    
      
      Voter
      Approve
      Disapprove
      DK
      Approve.1
      Disapprove.1
      DK.1
      Approve.2
      Disapprove.2
      DK.2
    
  
  
    
      0
      Total
      0.44
      0.49
      0.07
      0.44
      0.50
      0.06
      0.44
      0.50
      0.06
    
    
      1
      Republican
      0.06
      0.91
      0.03
      0.10
      0.86
      0.04
      0.10
      0.86
      0.04
    
    
      2
      Democrat
      0.84
      0.13
      0.03
      0.78
      0.17
      0.05
      0.78
      0.17
      0.05
    
    
      3
      Independant
      0.40
      0.51
      0.09
      0.37
      0.53
      0.10
      0.37
      0.53
      0.10
    
    
      4
      Men
      0.39
      0.55
      0.05
      0.41
      0.53
      0.05
      0.41
      0.53
      0.05
    
    
      5
      Women
      0.48
      0.44
      0.08
      0.47
      0.47
      0.06
      0.47
      0.47
      0.06
    
    
      6
      Under 35
      0.49
      0.41
      0.11
      0.56
      0.36
      0.08
      0.56
      0.36
      0.08
    
    
      7
      35-54
      0.48
      0.45
      0.06
      0.35
      0.60
      0.04
      0.35
      0.60
      0.04
    
    
      8
      55 plus
      0.39
      0.57
      0.04
      0.45
      0.50
      0.05
      0.45
      0.50
      0.05
    
    
      9
      Total
      0.58
      0.33
      0.09
      0.47
      0.43
      0.10
      0.47
      0.43
      0.10
    
    
      10
      Republican
      0.37
      0.56
      0.07
      0.34
      0.60
      0.06
      0.34
      0.60
      0.06
    
    
      11
      Democrat
      0.76
      0.15
      0.08
      0.60
      0.26
      0.14
      0.60
      0.26
      0.14
    
    
      12
      Independant
      0.60
      0.33
      0.07
      0.44
      0.49
      0.07
      0.44
      0.49
      0.07
    
    
      13
      Men
      0.55
      0.39
      0.06
      0.46
      0.48
      0.06
      0.46
      0.48
      0.06
    
    
      14
      Women
      0.61
      0.28
      0.11
      0.47
      0.39
      0.14
      0.47
      0.39
      0.14
    
    
      15
      Under 35
      0.66
      0.24
      0.11
      0.51
      0.37
      0.12
      0.51
      0.37
      0.12
    
    
      16
      35-54
      0.60
      0.33
      0.07
      0.46
      0.47
      0.07
      0.46
      0.47
      0.07
    
    
      17
      55 plus
      0.54
      0.37
      0.09
      0.45
      0.44
      0.11
      0.45
      0.44
      0.11



In [63]:

    
combined.plot(kind='scatter', x='Approve', y='Voter')









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   1944             try:
-> 1945                 return self._engine.get_loc(key)
   1946             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()

KeyError: 'Voter'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-63-c111ec1df4d6> in <module>()
----> 1 combined.plot(kind='scatter', x='Approve', y='Voter')

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in __call__(self, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   3738                           fontsize=fontsize, colormap=colormap, table=table,
   3739                           yerr=yerr, xerr=xerr, secondary_y=secondary_y,
-> 3740                           sort_columns=sort_columns, **kwds)
   3741     __call__.__doc__ = plot_frame.__doc__
   3742 

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in plot_frame(data, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   2612                  yerr=yerr, xerr=xerr,
   2613                  secondary_y=secondary_y, sort_columns=sort_columns,
-> 2614                  **kwds)
   2615 
   2616 

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _plot(data, x, y, subplots, ax, kind, **kwds)
   2439         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   2440 
-> 2441     plot_obj.generate()
   2442     plot_obj.draw()
   2443     return plot_obj.result

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in generate(self)
   1026         self._compute_plot_data()
   1027         self._setup_subplots()
-> 1028         self._make_plot()
   1029         self._add_table()
   1030         self._make_legend()

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _make_plot(self)
   1598         else:
   1599             label = None
-> 1600         scatter = ax.scatter(data[x].values, data[y].values, c=c_values,
   1601                              label=label, cmap=cmap, **self.kwds)
   1602         if cb:

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   1995             return self._getitem_multilevel(key)
   1996         else:
-> 1997             return self._getitem_column(key)
   1998 
   1999     def _getitem_column(self, key):

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in _getitem_column(self, key)
   2002         # get column
   2003         if self.columns.is_unique:
-> 2004             return self._get_item_cache(key)
   2005 
   2006         # duplicate columns & possible reduce dimensionality

/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1348         res = cache.get(item)
   1349         if res is None:
-> 1350             values = self._data.get(item)
   1351             res = self._box_item_values(item, values)
   1352             cache[item] = res

/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py in get(self, item, fastpath)
   3288 
   3289             if not isnull(item):
-> 3290                 loc = self.items.get_loc(item)
   3291             else:
   3292                 indexer = np.arange(len(self.items))[isnull(self.items)]

/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   1945                 return self._engine.get_loc(key)
   1946             except KeyError:
-> 1947                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   1948 
   1949         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()

KeyError: 'Voter'



In [38]:

    
plt.scatter(x=[iran_favor['Republican']], y=[obama_approve['Republican']])









    Out[38]:





<matplotlib.collections.PathCollection at 0x7f47d9772470>



In [ ]:

	Voter	Approve	Disapprove	DK	Approve.1	Disapprove.1	DK.1	Approve.2	Disapprove.2	DK.2
0	Total	0.58	0.33	0.09	0.47	0.43	0.10	0.47	0.43	0.10
1	Republican	0.37	0.56	0.07	0.34	0.60	0.06	0.34	0.60	0.06
2	Democrat	0.76	0.15	0.08	0.60	0.26	0.14	0.60	0.26	0.14
3	Independant	0.60	0.33	0.07	0.44	0.49	0.07	0.44	0.49	0.07
4	Men	0.55	0.39	0.06	0.46	0.48	0.06	0.46	0.48	0.06
5	Women	0.61	0.28	0.11	0.47	0.39	0.14	0.47	0.39	0.14
6	Under 35	0.66	0.24	0.11	0.51	0.37	0.12	0.51	0.37	0.12
7	35-54	0.60	0.33	0.07	0.46	0.47	0.07	0.46	0.47	0.07
8	55 plus	0.54	0.37	0.09	0.45	0.44	0.11	0.45	0.44	0.11