In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import statistics
from decimal import Decimal
In [2]:
# Load the 2013 NYC community-district income/recycling workbook into the
# working frame; the path is relative to the notebook's working directory.
df = pd.read_excel(io="2013_NYC_CD_MedianIncome_Recycle.xlsx")
In [3]:
# Preview the first five rows to check that the sheet loaded as expected.
df.head(n=5)
Out[3]:
In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# median-income column.
df.loc[:, 'MdHHIncE'].describe()
Out[4]:
In [5]:
# Scatter of recycling rate against median income, labelled through the
# Axes object that pandas returns rather than the pyplot state machine.
ax = df.plot.scatter(x='MdHHIncE', y='RecycleRate')
ax.set_xlabel('Median Income')
ax.set_ylabel('Recycling Rate')
Out[5]:
In [6]:
# Pairwise Pearson correlation between the numeric columns.
#
# The frame is restricted to numeric dtypes first: from pandas 2.0 onward
# DataFrame.corr no longer drops non-numeric columns silently and raises if
# any are present. Older pandas versions ignored such columns, so the result
# there is unchanged.
# NOTE(review): assumes any non-numeric columns (e.g. district labels) are
# not meant to take part in the correlation -- confirm against df.head().
df.select_dtypes(include='number').corr(method='pearson', min_periods=1)
Out[6]:
In [7]:
# Heatmap of the Pearson correlation matrix.
#
# Only numeric columns are correlated (pandas >= 2.0 raises on non-numeric
# columns instead of dropping them), and the axes are labelled with the
# column names so each cell can be read without cross-referencing the table.
corr_matrix = df.select_dtypes(include='number').corr()
tick_positions = range(len(corr_matrix.columns))

plt.matshow(corr_matrix)
plt.xticks(tick_positions, corr_matrix.columns, rotation=90)
plt.yticks(tick_positions, corr_matrix.columns)
plt.colorbar()  # colour scale for the correlation coefficients
Out[7]:
In [8]:
# Median of the median-income column.
df.loc[:, 'MdHHIncE'].median()
Out[8]:
In [9]:
# Most frequent value(s) of the median-income column. Series.mode always
# returns a Series, so several values appear when there is a tie.
df.loc[:, 'MdHHIncE'].mode()
Out[9]:
In [10]:
# 1st quartile (25th percentile) of median income.
df['MdHHIncE'].quantile(0.25)
Out[10]:
In [11]:
# 2nd quartile (50th percentile), i.e. the median of median income.
df['MdHHIncE'].quantile(0.5)
Out[11]:
In [12]:
# 3rd quartile (75th percentile) of median income.
df['MdHHIncE'].quantile(0.75)
Out[12]:
In [13]:
# Interquartile range of median income: the spread between the 25th and
# 75th percentiles, fetched together in a single quantile call.
q1, q3 = df['MdHHIncE'].quantile([0.25, 0.75])
IQR = q3 - q1
IQR
Out[13]:
In [14]:
# Distance beyond each quartile at which a value is treated as an outlier
# (the 1.5 x IQR rule used for the fences below).
IQR * 1.5
Out[14]:
In [15]:
# Lower outlier fence: first quartile minus 1.5 x IQR.
first_quartile = df['MdHHIncE'].quantile(0.25)
lower_outliers = first_quartile - 1.5 * IQR
In [16]:
# Upper outlier fence: third quartile plus 1.5 x IQR.
third_quartile = df['MdHHIncE'].quantile(0.75)
upper_outliers = third_quartile + 1.5 * IQR
In [21]:
# Display the lower outlier fence (Q1 - 1.5 * IQR); median-income values
# below this threshold are treated as low outliers.
lower_outliers
Out[21]:
In [18]:
# Display the upper outlier fence (Q3 + 1.5 * IQR); median-income values
# above this threshold are treated as high outliers.
upper_outliers
Out[18]:
In [19]:
# Rows whose median income lies strictly above the upper outlier fence.
df.loc[df['MdHHIncE'].gt(upper_outliers)]
Out[19]:
In [22]:
# Rows whose median income lies strictly below the lower outlier fence.
df.loc[df['MdHHIncE'].lt(lower_outliers)]
Out[22]:
In [ ]: