Using the 2013_NYC_CD_MedianIncome_Recycle.xlsx file, calculate the correlation between the recycling rate and the median income. Discuss your findings in your PR.
In [12]:
import pandas as pd # dataframe
import matplotlib.pyplot as plt # graphs & other analysis
import matplotlib as mpl # graphics parameters
import numpy as np # numerical calculations
from scipy import stats # Pearson correlation (stats.pearsonr)
In [4]:
# Load the median-income / recycle-rate workbook into a DataFrame.
# The path has no directory part, so it resolves against the current working directory.
df = pd.read_excel(io="2013_NYC_CD_MedianIncome_Recycle.xlsx")
In [5]:
# Preview the first five rows to check the column names used below.
df.head(5)
Out[5]:
In [9]:
%matplotlib inline
df.plot(kind='scatter',y='MdHHIncE',x='RecycleRate')
df.corr()
Out[9]:
Finding: A high correlation exists between median income and recycle rate.
In [30]:
def plot_correlation(ds, x, y, dotcolor, linecolor, ylim=(0, 140000), xlim=(.05, .35)):
    """Scatter-plot ``ds[y]`` against ``ds[x]`` with a linear trend line and Pearson r.

    Everything is drawn on the current pyplot axes; nothing is returned.

    Parameters
    ----------
    ds : mapping of column name -> 1-D numeric data (e.g. a pandas DataFrame)
        Source of the two series.
    x, y : str
        Keys of the series plotted on the horizontal and vertical axis; they
        are also used as the axis labels.
    dotcolor : matplotlib color
        Color of the scatter markers.
    linecolor : matplotlib color
        Color of the trend line and of the ``r=...`` annotation.
    ylim : (low, high), optional
        Vertical axis limits.
    xlim : (low, high), optional
        Horizontal axis limits. The trend line spans this range and the
        ``r=...`` label is anchored at its upper end.

    Raises
    ------
    ValueError
        If fewer than two rows have finite values in both series, since
        neither a linear fit nor a correlation is defined then.
    """
    xs = np.asarray(ds[x], dtype=float)
    ys = np.asarray(ds[y], dtype=float)

    # Keep only rows where both values are present: a single NaN would
    # otherwise poison both the least-squares fit and the correlation.
    complete = np.isfinite(xs) & np.isfinite(ys)
    xs, ys = xs[complete], ys[complete]
    if xs.size < 2:
        raise ValueError(
            "need at least two complete ({}, {}) pairs to fit a trend line".format(x, y)
        )

    plt.xlim(xlim[0], xlim[1])
    plt.ylim(ylim[0], ylim[1])
    plt.scatter(xs, ys, alpha=0.6, s=50, c=dotcolor)
    plt.xlabel(x)
    plt.ylabel(y)

    # Correlation: degree-1 least-squares fit, drawn across the visible x range.
    trend_variable = np.poly1d(np.polyfit(xs, ys, 1))
    trendx = np.array([xlim[0], xlim[1]], dtype=float)
    plt.plot(trendx, trend_variable(trendx), color=linecolor)

    # pearsonr yields (r, p-value); only the coefficient is shown.
    r = stats.pearsonr(xs, ys)[0]
    # Label the line at its right-hand end.
    plt.text(trendx[-1], trend_variable(trendx[-1]), 'r={:.3f}'.format(r), color=linecolor)
    plt.tight_layout()
In [31]:
import scipy as sp
from scipy import stats
In [32]:
plot_correlation(df,'RecycleRate','MdHHIncE','b','r')
In [ ]: