In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
The data comes from the Australian Bureau of Statistics; some cells have been removed (set to NaN) manually to serve the purposes of this notebook.
In [19]:
df = pd.read_csv('./asset/Median Price of Established House Transfers.txt', sep='\t') # row 3 has a null value
df.head()
Out[19]:
In [20]:
# find rows that Price is null
df[pd.isnull(df['Price'])]
Out[20]:
In [21]:
index_with_null = df[pd.isnull(df['Price'])].index
index_with_null
Out[21]:
We can specify a value (e.g., 0) to replace those null values via the fillna() method.
In [22]:
df2 = df.fillna(0) # the Price value of row 3 is set to 0.0
df2.loc[index_with_null]
Out[22]:
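fillna() also accepts a dict mapping column names to fill values, which is handy when we only want to touch the Price column; a minimal sketch:
In [ ]:
df2 = df.fillna({'Price': 0}) # only the Price column is filled; other columns keep their NaNs
df2.loc[index_with_null]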
We can also propagate non-null values forward or backward.
In [23]:
df2 = df.ffill() # forward fill; fillna(method='pad') is deprecated in recent pandas
df2.head() # the price of row 3 is the same as that of row 2
Out[23]:
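Backward propagation works the same way; a sketch using bfill(), assuming the following row is non-null:
In [ ]:
df2 = df.bfill() # backward fill: the price of row 3 is copied from row 4
df2.head()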
We can even drop the rows (or columns) with null values.
In [24]:
df2 = df.dropna(axis=0) # axis=1 would drop the columns containing null values instead
df2.head() # note that row 3 is dropped
Out[24]:
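dropna() offers finer control as well; for example, subset= restricts the null check to particular columns. A sketch:
In [ ]:
df2 = df.dropna(subset=['Price']) # drop only the rows where Price is null
df2.head()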
None of the above solutions is really appropriate here.
A better way to deal with the null values is to replace them with the mean price of the corresponding city over the whole year.
In [25]:
df["Price"] = df.groupby("City").transform(lambda x: x.fillna(x.mean()))
df.ix[index_with_null]
Out[25]:
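A quick sanity check confirms that no null prices remain:
In [ ]:
df['Price'].isnull().sum() # should be 0 after the group-wise fill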
From here on, we use the table with all null values filled.
In [26]:
pd.cut(df['Price'],5).head() # partition Price into 5 equal-width bins
Out[26]:
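pd.cut() can also report the bin edges it computed via retbins=True; a sketch:
In [ ]:
binned, edges = pd.cut(df['Price'], 5, retbins=True)
edges # the 6 boundaries of the 5 equal-width bins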
In [27]:
# We can label the bins and add them as a new column
df['Bin'] = pd.cut(df['Price'],5,labels=["Very Low","Low","Medium","High","Very High"])
df.head()
Out[27]:
In [28]:
pd.qcut(df['Price'],5).head() # note the difference from the equal-width partitioning case
Out[28]:
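As with pd.cut(), passing retbins=True reveals the boundaries qcut() chose (here, the 0%, 20%, ..., 100% quantiles); a sketch:
In [ ]:
_, edges = pd.qcut(df['Price'], 5, retbins=True)
edges # bin edges at the quintiles of Price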
In [29]:
# Let's check the depth of each bin
df['Bin'] = pd.qcut(df['Price'],5,labels=["Very Low","Low","Medium","High","Very High"])
df.groupby('Bin').size()
Out[29]:
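For comparison, the equal-width bins from pd.cut() usually have very uneven depths; a sketch:
In [ ]:
df.groupby(pd.cut(df['Price'], 5)).size() # equal-width bins rarely have equal depth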
In [30]:
df.head()
Out[30]:
In [31]:
df['Price-Smoothing-mean'] = df.groupby('Bin')['Price'].transform('mean')
df.head()
Out[31]:
In [32]:
df['Price-Smoothing-max'] = df.groupby('Bin')['Price'].transform('max')
df.head()
Out[32]:
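Another classic option is smoothing by bin boundaries, where each value is replaced by the closer of its bin's minimum and maximum. A minimal sketch; Price-Smoothing-boundary is a hypothetical column name:
In [ ]:
lo = df.groupby('Bin')['Price'].transform('min') # lower boundary of each value's bin
hi = df.groupby('Bin')['Price'].transform('max') # upper boundary of each value's bin
df['Price-Smoothing-boundary'] = np.where(df['Price'] - lo < hi - df['Price'], lo, hi)
df.head()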
In [33]:
df = pd.read_csv('./asset/Median Price of Established House.txt', sep='\t')
df.head()
Out[33]:
In [34]:
from sklearn import preprocessing
standard_scaler = preprocessing.StandardScaler()
x_scaled = standard_scaler.fit_transform(df[df.columns[1:5]]) # exclude the first (non-numeric City) column
df_standard = pd.DataFrame(x_scaled)
df_standard.insert(0, 'City', df.City)
df_standard
Out[34]:
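StandardScaler computes z-scores; we can verify the result against the definition (x - mean) / std directly in pandas. Note that pandas' std() defaults to ddof=1 while sklearn uses ddof=0, so we pass ddof=0 to match:
In [ ]:
cols = df.columns[1:5]
((df[cols] - df[cols].mean()) / df[cols].std(ddof=0)).head() # should match x_scaled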
In [35]:
from sklearn import preprocessing
robust_scaler = preprocessing.RobustScaler()
x_scaled = robust_scaler.fit_transform(df[df.columns[1:5]]) # exclude the first (non-numeric City) column
df_robust = pd.DataFrame(x_scaled)
df_robust.insert(0, 'City', df.City)
df_robust
Out[35]:
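For completeness, min-max scaling maps each column onto the [0, 1] range; a sketch using MinMaxScaler:
In [ ]:
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(df[df.columns[1:5]])
df_minmax = pd.DataFrame(x_scaled)
df_minmax.insert(0, 'City', df.City)
df_minmax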
In [36]:
df = pd.read_csv('./asset/Median Price of Established House.txt', sep='\t')
df.head()
Out[36]:
In [37]:
# use bins=x to control the number of bins
df.hist(column=['Q1','Q3'],bins=6,alpha=0.5,figsize=(16, 6))
Out[37]:
In [38]:
df.plot.scatter(x='Q1', y='Q3');
A scatter matrix provides a better way to discover relationships in the data.
In [39]:
from pandas.plotting import scatter_matrix # pandas.tools.plotting is deprecated in recent pandas
scatter_matrix(df, alpha=0.9, figsize=(12, 12), diagonal='hist') # set the diagonal panels to histograms
Out[39]:
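A numeric complement to the scatter matrix is the pairwise correlation matrix:
In [ ]:
df.corr(numeric_only=True) # Pearson correlation between the quarterly price columns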