Assignment 2

Using the 2013_NYC_CD_MedianIncome_Recycle.xlsx file, calculate the correlation between the recycling rate and the median income. Discuss your findings in your PR.


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import statistics
from decimal import Decimal

In [2]:
# Load the assignment spreadsheet into a DataFrame. The path is relative, so the
# .xlsx file must be in the notebook's working directory.
df = pd.read_excel("2013_NYC_CD_MedianIncome_Recycle.xlsx")

In [3]:
# Preview the first rows to check the column names and value formats.
df.head()


Out[3]:
CD_Name MdHHIncE RecycleRate
0 Battery Park City, Greenwich Village & Soho 119596 0.286771
1 Battery Park City, Greenwich Village & Soho 119596 0.264074
2 Chinatown & Lower East Side 40919 0.156485
3 Chelsea, Clinton & Midtown Business Distric 92583 0.235125
4 Chelsea, Clinton & Midtown Business Distric 92583 0.246725

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the median-income column.
df['MdHHIncE'].describe()


Out[4]:
count        59.000000
mean      53895.932203
std       24371.741796
min       21318.000000
25%       37950.000000
50%       48252.000000
75%       61967.000000
max      119596.000000
Name: MdHHIncE, dtype: float64

In [5]:
# Scatter of recycling rate against median income, one point per row of df.
# Labels are set on the Axes returned by df.plot rather than through pyplot.
income_vs_recycling = df.plot(kind='scatter', x='MdHHIncE', y='RecycleRate')
income_vs_recycling.set_xlabel('Median Income')
income_vs_recycling.set_ylabel('Recycling Rate')


Out[5]:
<matplotlib.text.Text at 0x6c95590>

In [6]:
# Pearson correlation between median income and recycling rate.
# Only the two numeric columns are passed in: CD_Name holds text, and since
# pandas 2.0 DataFrame.corr no longer drops non-numeric columns by default
# (it raises instead). Selecting the columns works on old and new pandas alike.
df[['MdHHIncE', 'RecycleRate']].corr(method='pearson', min_periods=1)


Out[6]:
MdHHIncE RecycleRate
MdHHIncE 1.000000 0.884783
RecycleRate 0.884783 1.000000

In [7]:
# Heat-map of the income / recycling-rate correlation matrix.
# The numeric columns are selected explicitly because df also contains the text
# column CD_Name, which makes a bare df.corr() raise on pandas >= 2.0.
plt.matshow(df[['MdHHIncE', 'RecycleRate']].corr())


Out[7]:
<matplotlib.image.AxesImage at 0x7ed92f0>

In [8]:
# Median of the income column.
df['MdHHIncE'].median()


Out[8]:
48252.0

In [9]:
# Most frequent income value(s); mode() returns every value tied for the highest count.
df['MdHHIncE'].mode()


Out[9]:
0     21318
1     22343
2     51251
3     92583
4    119596
dtype: int64

In [10]:
df['MdHHIncE'].quantile(q=0.25)  # first quartile (Q1): 25th percentile of income


Out[10]:
37950.0

In [11]:
df['MdHHIncE'].quantile(q=0.5)  # second quartile (Q2): 50th percentile, i.e. the median


Out[11]:
48252.0

In [12]:
df['MdHHIncE'].quantile(q=0.75)  # third quartile (Q3): 75th percentile of income


Out[12]:
61967.0

In [13]:
# Interquartile range (Q3 - Q1) of the income column.
income = df['MdHHIncE']
IQR = income.quantile(q=0.75) - income.quantile(q=0.25)
IQR


Out[13]:
24017.0

In [14]:
# 1.5 x IQR: the distance beyond Q1 / Q3 used for the outlier cut-offs computed below.
1.5 * IQR


Out[14]:
36025.5

In [15]:
# Lower cut-off for outliers: Q1 - 1.5 * IQR. Despite the name, this holds a
# single threshold value, not the outlying rows themselves.
lower_outliers = df['MdHHIncE'].quantile(q=0.25) - 1.5 * IQR

In [16]:
# Upper cut-off for outliers: Q3 + 1.5 * IQR. Despite the name, this holds a
# single threshold value, not the outlying rows themselves.
upper_outliers = df['MdHHIncE'].quantile(q=0.75) + 1.5 * IQR

In [21]:
# Display the lower cut-off (Q1 - 1.5 * IQR).
lower_outliers


Out[21]:
1924.5

In [18]:
# Display the upper cut-off (Q3 + 1.5 * IQR).
upper_outliers


Out[18]:
97992.5

In [19]:
# Rows whose median income lies strictly above the upper cut-off.
df.loc[df['MdHHIncE'].gt(upper_outliers)]


Out[19]:
CD_Name MdHHIncE RecycleRate
0 Battery Park City, Greenwich Village & Soho 119596 0.286771
1 Battery Park City, Greenwich Village & Soho 119596 0.264074
5 Murray Hill, Gramercy & Stuyvesant Town 101769 0.222046
7 Upper East Side 104602 0.253719

In [22]:
# Rows whose median income lies strictly below the lower cut-off.
df.loc[df['MdHHIncE'].lt(lower_outliers)]


Out[22]:
CD_Name MdHHIncE RecycleRate

In [ ]: