In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
In [2]:
df = pd.read_csv('./asset/sydney_housing_market.txt', sep='\t')
df.head()
Out[2]:
In order to build up a pivot table, we must specify an index.
In [19]:
pd.pivot_table(df, index=['type'])
Out[19]:
Note that the default aggregation function is np.mean
. We can specify the aggregation function in the aggfunc
parameter, as shown below.
In [18]:
pd.pivot_table(df, index=['type'], aggfunc={'distance_to_CBD':np.mean, 'sold':np.sum})
Out[18]:
For simplicity, we will stick with the default aggregation function.
We also want to see value, but we need to change it into floats first
In [4]:
df['value']=df['value'].replace('[\$,]','',regex=True).astype(float)
pd.pivot_table(df, index=['type'])
Out[4]:
We could also choose more than one column as index
In [5]:
pd.pivot_table(df, index=['type','council'])
Out[5]:
columns
provide an additional way to segment the data
In [6]:
pd.pivot_table(df, index=['council'], columns=['type'])
Out[6]:
Note that NaN
implies that there is no data here
The default aggfunc
is avg
but we could use other functions such as np.sum
In [7]:
pd.pivot_table(df, index=['type'],aggfunc=np.sum)
Out[7]:
Use margins=True
to show the total numbers
In [8]:
pd.pivot_table(df, index=['type','council'], aggfunc=np.sum, margins=True)
Out[8]:
We should use avg for value but sum for sold, and we do not want to see distance_to_CBD for now
In [9]:
pd.pivot_table(df, index=['council','suburb'],
columns=['type'],
values=['sold', 'value'],
aggfunc={'sold':np.sum, 'value':np.mean},
margins=True)
Out[9]:
We firstly build a pivot table
In [10]:
table = pd.pivot_table(df, index=['council'], columns=['type'], values=['sold'], aggfunc=np.sum, margins=True)
table
Out[10]:
We can just look at data from one city
In [11]:
table.query('council==["Randwick"]')
Out[11]:
We can also specify multiple values
In [12]:
table.query('council==["Rockdale","Lane Cove"]')
Out[12]:
Note: we can not query a pivot table from the columns
. So you should put item in index
if you want to query it.
Before you can execute the code below, you need to install plotly
, register a free account with them, and create a profile that contains your own API key. See https://plot.ly/python/getting-started/
In [13]:
plot_table = table[:-1] # get rid of ALL
In [14]:
plot_table.sold.house
Out[14]:
In [15]:
table.sold.house
Out[15]:
In [16]:
plot_table.index
Out[16]:
In [17]:
trace1 = go.Bar(
x=plot_table.index,
y=plot_table.sold.house,
name='House'
)
trace2 = go.Bar(
x=plot_table.index,
y=plot_table.sold.unit,
name='Unit'
)
data = [trace1, trace2]
layout = go.Layout(
barmode='group'
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='pandas-notebook-plot2')
Out[17]:
Perform some analysis that interests you using the Sydney Acution Data at https://auction-results.domain.com.au/Proofed/PDF/Sydney_Domain.pdf
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: