Assignment 4
Using data from this FiveThirtyEight post, write code to calculate the correlation of the responses from the poll.
Respond to the story in your PR. Is this a good example of data journalism? Why or why not?
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.formula.api as smf
In [54]:
iran_df=pd.read_csv('/home/sean/Documents/iran_polls.csv')
In [55]:
obama_df=pd.read_csv('/home/sean/Documents/obama_polls.csv')
In [56]:
iran_df
Out[56]:
Voter
Approve
Disapprove
DK
Approve.1
Disapprove.1
DK.1
Approve.2
Disapprove.2
DK.2
0
Total
0.58
0.33
0.09
0.47
0.43
0.10
0.47
0.43
0.10
1
Republican
0.37
0.56
0.07
0.34
0.60
0.06
0.34
0.60
0.06
2
Democrat
0.76
0.15
0.08
0.60
0.26
0.14
0.60
0.26
0.14
3
Independant
0.60
0.33
0.07
0.44
0.49
0.07
0.44
0.49
0.07
4
Men
0.55
0.39
0.06
0.46
0.48
0.06
0.46
0.48
0.06
5
Women
0.61
0.28
0.11
0.47
0.39
0.14
0.47
0.39
0.14
6
Under 35
0.66
0.24
0.11
0.51
0.37
0.12
0.51
0.37
0.12
7
35-54
0.60
0.33
0.07
0.46
0.47
0.07
0.46
0.47
0.07
8
55 plus
0.54
0.37
0.09
0.45
0.44
0.11
0.45
0.44
0.11
In [57]:
obama_df
Out[57]:
Voter
Approve
Disapprove
DK
Approve.1
Disapprove.1
DK.1
Approve.2
Disapprove.2
DK.2
0
Total
0.44
0.49
0.07
0.44
0.50
0.06
0.44
0.50
0.06
1
Republican
0.06
0.91
0.03
0.10
0.86
0.04
0.10
0.86
0.04
2
Democrat
0.84
0.13
0.03
0.78
0.17
0.05
0.78
0.17
0.05
3
Independant
0.40
0.51
0.09
0.37
0.53
0.10
0.37
0.53
0.10
4
Men
0.39
0.55
0.05
0.41
0.53
0.05
0.41
0.53
0.05
5
Women
0.48
0.44
0.08
0.47
0.47
0.06
0.47
0.47
0.06
6
Under 35
0.49
0.41
0.11
0.56
0.36
0.08
0.56
0.36
0.08
7
35-54
0.48
0.45
0.06
0.35
0.60
0.04
0.35
0.60
0.04
8
55 plus
0.39
0.57
0.04
0.45
0.50
0.05
0.45
0.50
0.05
In [58]:
combined=pd.merge(left=obama_df, right=iran_df, how='outer')
In [59]:
combined
Out[59]:
Voter
Approve
Disapprove
DK
Approve.1
Disapprove.1
DK.1
Approve.2
Disapprove.2
DK.2
0
Total
0.44
0.49
0.07
0.44
0.50
0.06
0.44
0.50
0.06
1
Republican
0.06
0.91
0.03
0.10
0.86
0.04
0.10
0.86
0.04
2
Democrat
0.84
0.13
0.03
0.78
0.17
0.05
0.78
0.17
0.05
3
Independant
0.40
0.51
0.09
0.37
0.53
0.10
0.37
0.53
0.10
4
Men
0.39
0.55
0.05
0.41
0.53
0.05
0.41
0.53
0.05
5
Women
0.48
0.44
0.08
0.47
0.47
0.06
0.47
0.47
0.06
6
Under 35
0.49
0.41
0.11
0.56
0.36
0.08
0.56
0.36
0.08
7
35-54
0.48
0.45
0.06
0.35
0.60
0.04
0.35
0.60
0.04
8
55 plus
0.39
0.57
0.04
0.45
0.50
0.05
0.45
0.50
0.05
9
Total
0.58
0.33
0.09
0.47
0.43
0.10
0.47
0.43
0.10
10
Republican
0.37
0.56
0.07
0.34
0.60
0.06
0.34
0.60
0.06
11
Democrat
0.76
0.15
0.08
0.60
0.26
0.14
0.60
0.26
0.14
12
Independant
0.60
0.33
0.07
0.44
0.49
0.07
0.44
0.49
0.07
13
Men
0.55
0.39
0.06
0.46
0.48
0.06
0.46
0.48
0.06
14
Women
0.61
0.28
0.11
0.47
0.39
0.14
0.47
0.39
0.14
15
Under 35
0.66
0.24
0.11
0.51
0.37
0.12
0.51
0.37
0.12
16
35-54
0.60
0.33
0.07
0.46
0.47
0.07
0.46
0.47
0.07
17
55 plus
0.54
0.37
0.09
0.45
0.44
0.11
0.45
0.44
0.11
In [63]:
combined.plot(kind='scatter', x='Approve', y='Voter')
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
1944 try:
-> 1945 return self._engine.get_loc(key)
1946 except KeyError:
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()
KeyError: 'Voter'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-63-c111ec1df4d6> in <module>()
----> 1 combined.plot(kind='scatter', x='Approve', y='Voter')
/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in __call__(self, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
3738 fontsize=fontsize, colormap=colormap, table=table,
3739 yerr=yerr, xerr=xerr, secondary_y=secondary_y,
-> 3740 sort_columns=sort_columns, **kwds)
3741 __call__.__doc__ = plot_frame.__doc__
3742
/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in plot_frame(data, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
2612 yerr=yerr, xerr=xerr,
2613 secondary_y=secondary_y, sort_columns=sort_columns,
-> 2614 **kwds)
2615
2616
/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _plot(data, x, y, subplots, ax, kind, **kwds)
2439 plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
2440
-> 2441 plot_obj.generate()
2442 plot_obj.draw()
2443 return plot_obj.result
/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in generate(self)
1026 self._compute_plot_data()
1027 self._setup_subplots()
-> 1028 self._make_plot()
1029 self._add_table()
1030 self._make_legend()
/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _make_plot(self)
1598 else:
1599 label = None
-> 1600 scatter = ax.scatter(data[x].values, data[y].values, c=c_values,
1601 label=label, cmap=cmap, **self.kwds)
1602 if cb:
/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in __getitem__(self, key)
1995 return self._getitem_multilevel(key)
1996 else:
-> 1997 return self._getitem_column(key)
1998
1999 def _getitem_column(self, key):
/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in _getitem_column(self, key)
2002 # get column
2003 if self.columns.is_unique:
-> 2004 return self._get_item_cache(key)
2005
2006 # duplicate columns & possible reduce dimensionality
/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py in _get_item_cache(self, item)
1348 res = cache.get(item)
1349 if res is None:
-> 1350 values = self._data.get(item)
1351 res = self._box_item_values(item, values)
1352 cache[item] = res
/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py in get(self, item, fastpath)
3288
3289 if not isnull(item):
-> 3290 loc = self.items.get_loc(item)
3291 else:
3292 indexer = np.arange(len(self.items))[isnull(self.items)]
/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
1945 return self._engine.get_loc(key)
1946 except KeyError:
-> 1947 return self._engine.get_loc(self._maybe_cast_indexer(key))
1948
1949 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()
KeyError: 'Voter'
In [38]:
plt.scatter(x=[iran_favor['Republican']], y=[obama_approve['Republican']])
Out[38]:
<matplotlib.collections.PathCollection at 0x7f47d9772470>
In [ ]:
Content source: ledeprogram/algorithms
Similar notebooks: