In [1]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
import statsmodels.formula.api as smf
df = pd.read_excel('2013_NYC_CD_MedianIncome_Recycle.xlsx')
In [2]:
# Replace the spreadsheet headers with short identifiers that can be used
# directly in a model formula. The frame must have exactly three columns.
df.columns = ['Neighborhood', 'Median_Income', 'Recycle_Rate']

# First look at the raw relationship before fitting anything.
df.plot.scatter(x='Median_Income', y='Recycle_Rate')
Out[2]:
In [3]:
# Correlation of every numeric column with Median_Income, strongest first.
# Restrict to numeric columns explicitly: 'Neighborhood' is presumably a text
# label, and pandas >= 2.0 raises on DataFrame.corr() when a non-numeric column
# is present (earlier versions silently dropped it). Selecting by dtype gives
# the same result on old and new pandas.
df.select_dtypes(include='number').corr()['Median_Income'].sort_values(ascending=False)
Out[3]:
In [29]:
# Fit a simple linear regression of Recycle_Rate on Median_Income
# (ordinary least squares via the statsmodels formula interface).
lm = smf.ols(
    "Recycle_Rate~Median_Income",
    data=df,
).fit()

# Show the fitted coefficients as the cell output.
lm.params
Out[29]:
In [30]:
# Pull the coefficients out by label rather than by position, so the two can
# never be silently swapped if the term order in `lm.params` differs from what
# a positional unpack assumes. A KeyError here means the model formula changed.
intercept = lm.params['Intercept']
slope = lm.params['Median_Income']
In [31]:
# Scatter of the observations, drawn on an explicit Axes so that every
# following call targets the same plot.
ax = df.plot(kind="scatter", x="Median_Income", y="Recycle_Rate")

# Fitted line: predicted recycle rate at each observed income.
fitted_rate = slope * df["Median_Income"] + intercept
ax.plot(df["Median_Income"], fitted_rate, "-", color="darkgrey")

# Labels so the figure stands on its own.
ax.set_title('Correlation between income and recycling rate')
ax.set_xlabel('Median Income ($)')
ax.set_ylabel('Recycle Rate')
Out[31]:
In [36]:
# Interactive prediction: ask for a median income and report the recycle rate
# the fitted line predicts for it (slope * income + intercept).
income_text = input('What is the median income of the neighborhood? ')

# Accept the ways people normally type money ("$55,000", "55000.50") instead
# of crashing in int(); anything still non-numeric gets a clear error.
cleaned_income = income_text.strip().replace('$', '').replace(',', '')
try:
    median_income = float(cleaned_income)
except ValueError:
    raise ValueError(
        'Could not read {!r} as a dollar amount; enter a number such as 55000.'.format(income_text)
    )

# Keep whole-dollar amounts as int so the message reads "$55000", not "$55000.0".
if median_income.is_integer():
    median_income = int(median_income)

# NOTE(review): the line is only supported within the income range present in
# the data; predictions far outside it are extrapolations.
recycle_rate = slope * median_income + intercept
print('If the neighborhood\'s median income is $' + str(median_income) + ' its recycle rate is probably around ' + str(round(recycle_rate, 2)) + ' percent.')