In [1]:
import pandas as pd
In [2]:
# Load the CSV into `mlo`: cells equal to -99.99 are read as NaN, and the
# 'Date' column is parsed as dates and used as the index.
mlo = pd.read_csv(
    '../data/co2-mm-mlo.csv',
    index_col='Date',
    parse_dates=True,
    na_values=-99.99,
)
In [3]:
# Preview the first rows of the loaded frame.
mlo.head()
Out[3]:
In [4]:
# Pull out the 'Interpolated' column as a Series; it feeds the rolling mean below.
s = mlo['Interpolated']
In [5]:
# Rolling mean of `s` over a 12-observation window, attached as a 'smooth'
# column on the frame returned by `assign`; only the last rows are shown.
(
    mlo
    .assign(smooth=s.rolling(window=12).mean())
    .tail()
)
Out[5]:
A copy is returned.
In [6]:
# Display `mlo` again: the `assign` call returned a new frame, so no 'smooth'
# column is expected on `mlo` itself.
mlo.head()
Out[6]:
In [7]:
# Label-based slice with `.loc`: the 'Average' column for rows up to and
# including '1958-05' (`.loc` slices include the end label).
s2 = mlo.loc[:'1958-05', 'Average']
s2
Out[7]:
A view is returned.
In [8]:
# Overwrite every element of `s2` in place (slice assignment, not a rebinding
# of the name `s2`).
s2[:] = 313
In [9]:
# Show `s2` after the in-place assignment.
s2
Out[9]:
In [10]:
# Check whether the write through `s2` also changed the first rows of `mlo`.
mlo.head()
Out[10]:
**Exercise:** Change the values of `s2` while preserving the original `mlo` DataFrame. (Hint: Remember the NumPy lesson.)
In [11]:
# Chained indexing: select the 'Average' column first, then index the
# resulting Series by the date string.
mlo['Average']['1958-03']
Out[11]:
In [12]:
# Chained assignment, shown deliberately as the pattern to avoid: the value is
# written into whatever `mlo['Average']` returned, and pandas does not
# guarantee that this intermediate object is a view of `mlo`.
# Preferred form: mlo.loc['1958-03', 'Average'] = 312
mlo['Average']['1958-03'] = 312
Generally speaking, chained indexing is not a good practice. To set a new value, use `mlo.loc[row_indexer, col_indexer] = value`, because `mlo.loc` is guaranteed to be `mlo` itself, so the assignment always reaches the original DataFrame.
In [13]:
# Single `.loc` call taking both the row and the column indexer.
mlo.loc['1958-03', 'Average']
Out[13]:
In [14]:
# Build a two-level index from the Cartesian product of the two label lists.
h_index = pd.MultiIndex.from_product(
    iterables=[
        ['first', 'second'],
        ['A', 'B'],
    ],
)
h_index
Out[14]:
In [15]:
# A Series of the integers 0-3, labelled by the entries of `h_index`.
x = pd.Series(data=range(4), index=h_index)
x
Out[15]:
In [16]:
# Select by an outer-level label only.
x['first']
Out[16]:
In [17]:
# Two chained selections: `x['first']` is evaluated first, then `['B']` is
# applied to that intermediate result.
x['first']['B']
Out[17]:
In the above, there are two selection operations.
In [18]:
# One selection: the complete (outer, inner) key is passed to `.loc` as a tuple.
x.loc[('first', 'B')]
Out[18]:
In the above, there is a single selection operation.
We can end up with a hierarchical index when stacking records.
In [19]:
# Read the second CSV (-99.99 -> NaN, parsed 'Date' index), keep only the
# 'Average' column, and relabel it 'Average_gl'.
gl = (
    pd.read_csv(
        '../data/co2-mm-gl.csv',
        index_col='Date',
        parse_dates=True,
        na_values=-99.99,
    )[['Average']]
    .rename(columns={'Average': 'Average_gl'})
)
gl.head()
Out[19]:
In [20]:
# One-column frame taken from `mlo`, with the column relabelled 'Average_mlo'.
ml = mlo[['Average']].rename(columns={'Average': 'Average_mlo'})
ml.head()
Out[20]:
In [21]:
# Keep the `ml` rows dated 1980-01 or later, then cut both frames down to
# their first five rows so the examples stay small.
ml = ml.loc[ml.index >= '1980-01'].head()
gl = gl.head()
In [22]:
# Place the two frames side by side on their shared index, then move the
# column labels into an inner index level, which yields a Series.
multi = pd.concat([ml, gl], axis='columns').stack()
multi
Out[22]:
In [23]:
# Inspect the index of the stacked Series.
multi.index
Out[23]:
In [24]:
# Values of the 'Date' level, one per entry of the index.
multi.index.get_level_values('Date')
Out[24]:
In [25]:
# Boolean mask built from the 'Date' level: keep the entries dated before 1980-03.
multi.loc[multi.index.get_level_values('Date') < '1980-03']
Out[25]:
**Exercise:** Select the values of the `multi` series for the `Average_mlo` variable.

The `stack()` function compressed a level in the DataFrame’s columns to produce a Series (as a reminder, `multi = pd.concat([ml, gl], axis=1).stack()`).
In [26]:
# The side-by-side frame as it looks before `stack()` is applied.
pd.concat([ml, gl], axis=1)
Out[26]:
In [27]:
# The stacked Series, for comparison with the side-by-side frame.
multi
Out[27]:
The inverse function is `unstack()`; it is designed to work with a hierarchical index.
In [28]:
# Reverse the stacking: with no argument, `unstack()` pivots the innermost
# index level back into columns.
multi.unstack()
Out[28]:
**Exercise:** Consider the series `x` defined earlier. What does `x.unstack(0)` return?
In [29]:
# Stack as before, then turn the index levels into ordinary columns so each
# row is one (date, variable, value) record.
rec = (
    pd.concat([ml, gl], axis='columns')
    .stack()
    .reset_index()
)
rec.columns = ['date', 'variable', 'value']
rec
Out[29]:
The above data is in 'stacked' or 'record' format.
In [30]:
# Show the record-format table again.
rec
Out[30]:
In [31]:
# Keep only the records whose 'variable' field is 'Average_mlo'.
rec.loc[rec['variable'] == 'Average_mlo']
Out[31]:
In [32]:
# Reshape the records into a wide table: one row per date, one column per
# variable, cells filled from 'value'.
pivot_table = rec.pivot(
    index='date',
    columns='variable',
    values='value',
)
pivot_table
Out[32]:
The pivoted data is more suitable for timeseries analysis.
In [33]:
# Each variable is now a plain column that can be selected by name.
pivot_table['Average_gl']
Out[33]:
In [34]:
# Inspect the index of the pivoted table (built from the 'date' column).
pivot_table.index
Out[34]:
**Exercises:**

- How would you transform `x` so that you could apply the `.pivot()` method on it?
- Let `alt = pd.DataFrame({'x': [0, 0, 1, 1], 'y': [0, 1, 0, 1], 'z': [0.5, 0.8, 0.6, 0.3]})`. Pivot this DataFrame.
- Express `alt` as a Series (of `z` values) with a multi-index. Unstack it. How does it compare with the pivot table above?