Using the 2013_NYC_CD_MedianIncome_Recycle.xlsx file, calculate the correlation between the recycling rate and the median income. Discuss your findings in your PR.


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the community-district median income / recycling rate workbook.
# The path is relative to the notebook's working directory so the notebook
# is not tied to one user's home directory.
# NOTE(review): assumes the notebook sits next to the data/ folder -- confirm.
df = pd.read_excel('data/2013_NYC_CD_MedianIncome_Recycle.xlsx')

In [6]:
# Preview the first five rows to check the column names and value formats.
df.head(n=5)


Out[6]:
CD_Name MdHHIncE RecycleRate
0 Battery Park City, Greenwich Village & Soho 119596 0.286771
1 Battery Park City, Greenwich Village & Soho 119596 0.264074
2 Chinatown & Lower East Side 40919 0.156485
3 Chelsea, Clinton & Midtown Business Distric 92583 0.235125
4 Chelsea, Clinton & Midtown Business Distric 92583 0.246725

In [9]:
# Scatter of recycling rate (y) against median income (x) to eyeball the relationship.
df.plot.scatter(x='MdHHIncE', y='RecycleRate')


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7facd008f940>

In [14]:
# Pearson correlation between median income and recycling rate.
# Select the two numeric columns explicitly: the frame also holds the string
# column CD_Name, and newer pandas versions raise on non-numeric columns
# instead of silently dropping them.
df[['MdHHIncE', 'RecycleRate']].corr()


Out[14]:
MdHHIncE RecycleRate
MdHHIncE 1.000000 0.884783
RecycleRate 0.884783 1.000000

In [16]:
# Scatter plot with a least-squares regression line.
# Pattern adapted from http://matthiaseisen.com/pp/patterns/p0170/
income = df['MdHHIncE']
recycle_rate = df['RecycleRate']

# A degree-1 fit returns the coefficients highest power first: slope, then intercept.
slope, intercept = np.polyfit(income, recycle_rate, deg=1)

fig, ax = plt.subplots()
ax.plot(income, slope * income + intercept, color='red')

# Label the figure so it can be read on its own.
ax.set_xlabel('MdHHIncE')
ax.set_ylabel('RecycleRate')
ax.set_title('Recycling rate vs. median income')

# The scatter call stays last so the cell's displayed value is unchanged.
ax.scatter(income, recycle_rate)


Out[16]:
<matplotlib.collections.PathCollection at 0x7facca6d1780>

In [ ]: