Using the 2013_NYC_CD_MedianIncome_Recycle.xlsx file, calculate the correlation between the recycling rate and the median income. Discuss your findings in your PR.


In [12]:
import pandas as pd                   # dataframe
import matplotlib.pyplot as plt       # graphs & other analysis
import matplotlib as mpl              # graphics parameters
import numpy as np                    # numerical calculations

In [4]:
# Read the workbook that sits next to the notebook into a DataFrame.
df = pd.read_excel(io="2013_NYC_CD_MedianIncome_Recycle.xlsx")

In [5]:
# Show the first five rows to check the column names and value ranges.
df.head(n=5)


Out[5]:
CD_Name MdHHIncE RecycleRate
0 Battery Park City, Greenwich Village & Soho 119596 0.286771
1 Battery Park City, Greenwich Village & Soho 119596 0.264074
2 Chinatown & Lower East Side 40919 0.156485
3 Chelsea, Clinton & Midtown Business Distric 92583 0.235125
4 Chelsea, Clinton & Midtown Business Distric 92583 0.246725

In [9]:
%matplotlib inline
df.plot(kind='scatter',y='MdHHIncE',x='RecycleRate')
df.corr()


Out[9]:
MdHHIncE RecycleRate
MdHHIncE 1.000000 0.884783
RecycleRate 0.884783 1.000000

Finding: A high positive correlation (r ≈ 0.88) exists between median income and recycling rate.

Further Analysis


In [30]:
def plot_correlation(ds, x, y, dotcolor, linecolor, ylim=(0, 140000), xlim=(.05, .35)):
    """Scatter-plot ``ds[y]`` against ``ds[x]`` with a linear trend line and Pearson r.

    Draws on the current pyplot figure: the points, a degree-1 least-squares
    fit across the x-limits, and an ``r=...`` label at the right end of the line.

    Parameters
    ----------
    ds : object indexable by column name (e.g. a pandas DataFrame)
        Source of the two data columns.
    x, y : str
        Names of the columns plotted on the horizontal / vertical axis; they
        are also used as the axis labels.
    dotcolor, linecolor : matplotlib color
        Colors of the scatter points and of the trend line / label.
    ylim : (low, high), optional
        Vertical axis limits. Defaults to (0, 140000).
    xlim : (low, high), optional
        Horizontal axis limits; the trend line is drawn between them.
        Defaults to (.05, .35), the range that used to be hard-coded.

    Raises
    ------
    ValueError
        If fewer than two rows have a value in both columns, since neither a
        line fit nor a correlation coefficient is defined then.
    """
    # Imported locally: the notebook only imports scipy in a later cell, and a
    # bare ``import scipy`` does not guarantee that ``scipy.stats`` is loaded.
    from scipy import stats

    # A missing value in either column would turn the fit and r into NaN (or
    # make them fail), so only complete (x, y) pairs are used.
    pairs = pd.DataFrame({'x': ds[x], 'y': ds[y]}).dropna()
    if len(pairs) < 2:
        raise ValueError(
            'need at least two rows with both {!r} and {!r} present'.format(x, y)
        )
    xs, ys = pairs['x'], pairs['y']

    plt.xlim(xlim[0], xlim[1])
    plt.ylim(ylim[0], ylim[1])
    plt.scatter(xs, ys, alpha=0.6, s=50, c=dotcolor)
    plt.xlabel(x)
    plt.ylabel(y)

    # Trend line: a straight line is fully determined by its two end points,
    # so evaluating the fit at the axis limits is sufficient.
    trend = np.poly1d(np.polyfit(xs, ys, 1))
    trendx = np.array([xlim[0], xlim[1]], dtype=float)
    plt.plot(trendx, trend(trendx), color=linecolor)

    # Pearson correlation coefficient, printed where the line meets the right edge.
    r_value = stats.pearsonr(xs, ys)[0]
    plt.text(trendx[-1], trend(trendx[-1]), 'r={:.3f}'.format(r_value), color=linecolor)
    plt.tight_layout()

In [31]:
import scipy as sp
from scipy import stats

In [32]:
# Income versus recycling rate: blue points, red trend line and r label.
plot_correlation(
    df,
    x='RecycleRate',
    y='MdHHIncE',
    dotcolor='b',
    linecolor='r',
)



In [ ]: