In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
In [2]:
df = pd.read_csv('./asset/sydney_housing_market.txt', sep='\t')
df.head()
Out[2]:
In order to build up a pivot table, we must specify an index.
In [19]:
pd.pivot_table(df, index=['type'])
Out[19]:
Note that the default aggregation function is np.mean. We can specify the aggregation function in the aggfunc parameter, as shown below.
In [18]:
pd.pivot_table(df, index=['type'], aggfunc={'distance_to_CBD':np.mean, 'sold':np.sum})
Out[18]:
For simplicity, we will stick with the default aggregation function.
We also want to see value, but we need to change it into floats first
In [4]:
df['value']=df['value'].replace('[\$,]','',regex=True).astype(float)
pd.pivot_table(df, index=['type'])
Out[4]:
We could also choose more than one column as index
In [5]:
pd.pivot_table(df, index=['type','council'])
Out[5]:
columns provide an additional way to segment the data
In [6]:
pd.pivot_table(df, index=['council'], columns=['type'])
Out[6]:
Note that NaN implies that there is no data here
The default aggfunc is avg but we could use other functions such as np.sum
In [7]:
pd.pivot_table(df, index=['type'],aggfunc=np.sum)
Out[7]:
Use margins=True to show the total numbers
In [8]:
pd.pivot_table(df, index=['type','council'], aggfunc=np.sum, margins=True)
Out[8]:
We should use avg for value but sum for sold, and we do not want to see distance_to_CBD for now
In [9]:
pd.pivot_table(df, index=['council','suburb'],
columns=['type'],
values=['sold', 'value'],
aggfunc={'sold':np.sum, 'value':np.mean},
margins=True)
Out[9]:
We firstly build a pivot table
In [10]:
table = pd.pivot_table(df, index=['council'], columns=['type'], values=['sold'], aggfunc=np.sum, margins=True)
table
Out[10]:
We can just look at data from one city
In [11]:
table.query('council==["Randwick"]')
Out[11]:
We can also specify multiple values
In [12]:
table.query('council==["Rockdale","Lane Cove"]')
Out[12]:
Note: we can not query a pivot table from the columns. So you should put item in index if you want to query it.
Before you can execute the code below, you need to install plotly, register a free account with them, and create a profile that contains your own API key. See https://plot.ly/python/getting-started/
In [13]:
plot_table = table[:-1] # get rid of ALL
In [14]:
plot_table.sold.house
Out[14]:
In [15]:
table.sold.house
Out[15]:
In [16]:
plot_table.index
Out[16]:
In [17]:
trace1 = go.Bar(
x=plot_table.index,
y=plot_table.sold.house,
name='House'
)
trace2 = go.Bar(
x=plot_table.index,
y=plot_table.sold.unit,
name='Unit'
)
data = [trace1, trace2]
layout = go.Layout(
barmode='group'
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='pandas-notebook-plot2')
Out[17]:
Perform some analysis that interests you using the Sydney Acution Data at https://auction-results.domain.com.au/Proofed/PDF/Sydney_Domain.pdf
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: