In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read the Excel workbook (path relative to the notebook's working directory)
# into the DataFrame `df` that every later cell operates on.
df = pd.read_excel(io="2013_NYC_CD_MedianIncome_Recycle.xlsx")

In [4]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)


Out[4]:
CD_Name MdHHIncE RecycleRate
0 Battery Park City, Greenwich Village & Soho 119596 0.286771
1 Battery Park City, Greenwich Village & Soho 119596 0.264074
2 Chinatown & Lower East Side 40919 0.156485
3 Chelsea, Clinton & Midtown Business Distric 92583 0.235125
4 Chelsea, Clinton & Midtown Business Distric 92583 0.246725

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# 'MdHHIncE' column.
df.loc[:, 'MdHHIncE'].describe()


Out[5]:
count        59.000000
mean      53895.932203
std       24371.741796
min       21318.000000
25%       37950.000000
50%       48252.000000
75%       61967.000000
max      119596.000000
Name: MdHHIncE, dtype: float64

In [12]:
# Median of the 'MdHHIncE' column.
df.loc[:, 'MdHHIncE'].median()


Out[12]:
48252.0

In [14]:
# Most frequent value(s) of 'MdHHIncE'; pandas returns every value tied for
# the highest count, so the result can contain several entries.
df.loc[:, 'MdHHIncE'].mode()


Out[14]:
0     21318
1     22343
2     51251
3     92583
4    119596
dtype: int64

In [23]:
# Spread of the 'MdHHIncE' column: largest value minus smallest value.
# Stored as `income_range` rather than `range` so the Python built-in
# `range()` is not overwritten for the rest of the kernel session.
income_range = df['MdHHIncE'].max() - df['MdHHIncE'].min()
income_range


Out[23]:
98278

In [15]:
# Third quartile (75th percentile) of 'MdHHIncE'.
df.loc[:, 'MdHHIncE'].quantile(0.75)


Out[15]:
61967.0

In [17]:
# First quartile (25th percentile) of 'MdHHIncE'.
df.loc[:, 'MdHHIncE'].quantile(0.25)


Out[17]:
37950.0

In [16]:
# Second quartile (50th percentile) of 'MdHHIncE' -- by definition the same
# value as the median.
df.loc[:, 'MdHHIncE'].quantile(0.5)


Out[16]:
48252.0

In [19]:
# Interquartile range of 'MdHHIncE': 75th percentile minus 25th percentile.
income = df['MdHHIncE']
upper_quartile = income.quantile(0.75)
lower_quartile = income.quantile(0.25)
Inter_quartile_Range = upper_quartile - lower_quartile
Inter_quartile_Range


Out[19]:
24017.0

In [21]:
# 1.5 times the interquartile range computed in the previous cell.
# NOTE(review): presumably the step used by the common "1.5 x IQR" outlier
# rule (values further than this beyond a quartile are flagged) -- confirm;
# no fences or flagged rows are computed in this notebook.
Identification_of_outlier = Inter_quartile_Range * 1.5
Identification_of_outlier


Out[21]:
36025.5

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# 'RecycleRate' column.
df.loc[:, 'RecycleRate'].describe()


Out[7]:
count    59.000000
mean      0.175569
std       0.051499
min       0.091464
25%       0.133510
50%       0.174876
75%       0.212835
max       0.302798
Name: RecycleRate, dtype: float64

In [10]:
# Scatter plot of 'RecycleRate' (y) against 'MdHHIncE' (x).
# The Axes returned by pandas is labelled directly instead of going through
# pyplot's implicit "current axes".
ax = df.plot(x='MdHHIncE', y='RecycleRate', kind='scatter')
ax.set_xlabel('Median Income')
ax.set_ylabel('Recycling Rate')


Out[10]:
<matplotlib.text.Text at 0x11163dc88>

In [11]:
# Pairwise correlation (pandas default method: Pearson) between the two
# numeric columns. The columns are selected explicitly because `df` also
# holds the text column 'CD_Name'; pandas >= 2.0 no longer drops non-numeric
# columns silently in DataFrame.corr(), so a bare df.corr() fails there.
df[['MdHHIncE', 'RecycleRate']].corr()


Out[11]:
MdHHIncE RecycleRate
MdHHIncE 1.000000 0.884783
RecycleRate 0.884783 1.000000

In [ ]: