In [1]:
from pandas import Series, DataFrame
import pandas as pd
In [2]:
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas as pd
np.set_printoptions(precision=4)
In [3]:
%pwd
Out[3]:
In [4]:
obj = Series([4, 7, -5, 3])
obj
Out[4]:
In [5]:
obj.name
In [6]:
obj.values
Out[6]:
In [7]:
obj.index
Out[7]:
In [8]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2
Out[8]:
In [9]:
obj2.index
Out[9]:
In [10]:
obj2.values
Out[10]:
In [11]:
obj2['a']
Out[11]:
In [12]:
obj2['d'] = 6
In [13]:
obj2[['c', 'a', 'd']]
Out[13]:
In [14]:
obj2[obj2 > 0]
Out[14]:
In [15]:
obj2 * 2
Out[15]:
In [16]:
np.exp(obj2)
Out[16]:
In [17]:
'b' in obj2
Out[17]:
In [18]:
'e' in obj2
Out[18]:
In [19]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3
Out[19]:
In [20]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4
Out[20]:
In [21]:
pd.isnull(obj4)
Out[21]:
In [22]:
pd.notnull(obj4)
Out[22]:
In [13]:
obj4.isnull()
Out[13]:
In [14]:
obj3
Out[14]:
In [15]:
obj4
Out[15]:
In [16]:
obj3 + obj4
Out[16]:
In [17]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4
Out[17]:
In [23]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj
Out[23]:
In [24]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
In [25]:
frame
Out[25]:
In [26]:
DataFrame(data, columns=['year', 'state', 'pop'])
Out[26]:
In [27]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
index=['one', 'two', 'three', 'four', 'five'])
frame2
Out[27]:
In [28]:
frame2.columns
Out[28]:
A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute:
In [29]:
frame2['state']
Out[29]:
In [30]:
frame2.year
Out[30]:
Rows can also be retrieved by position or name by a couple of methods, such as the ix indexing field (deprecated and later removed in modern pandas in favor of loc/iloc)
In [ ]:
# .ix was removed in pandas 1.0; label-based row lookup is .loc
frame2.loc['three']
Columns can be modified by assignment. For example, the empty 'debt' column could be assigned a scalar value or an array of values:
In [32]:
frame2['debt'] = 16.5
frame2
Out[32]:
In [33]:
frame2['debt'] = np.arange(5.)
frame2
Out[33]:
When assigning lists or arrays to a column, the value’s length must match the length of the DataFrame. If you assign a Series, it will be instead conformed exactly to the DataFrame’s index, inserting missing values in any holes:
In [34]:
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2
Out[34]:
Assigning a column that doesn’t exist will create a new column.
In [35]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2
Out[35]:
The del keyword will delete columns as with a dict:
In [36]:
del frame2['eastern']
frame2.columns
Out[36]:
If passed to DataFrame, a nested dict of dicts will be interpreted as: the outer dict keys as the columns and the inner keys as the row indices:
In [40]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
In [41]:
frame3 = DataFrame(pop)
frame3
Out[41]:
In [42]:
frame3.T
Out[42]:
The keys in the inner dicts are unioned and sorted to form the index in the result. This isn’t true if an explicit index is specified:
In [43]:
DataFrame(pop, index=[2001, 2002, 2003])
Out[43]:
Dicts of Series are treated much in the same way:
In [44]:
frame3['Ohio'][:-1]
Out[44]:
In [45]:
frame3['Nevada'][:2]
Out[45]:
In [46]:
pdata = {'Ohio': frame3['Ohio'][:-1],
'Nevada': frame3['Nevada'][:2]}
DataFrame(pdata)
Out[46]:
If a DataFrame’s index and columns have their name attributes set, these will also be displayed:
In [47]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3
Out[47]:
Like Series, the values attribute returns the data contained in the DataFrame as a 2D ndarray:
In [48]:
frame3.values
Out[48]:
If the DataFrame’s columns are different dtypes, the dtype of the values array will be chosen to accommodate all of the columns:
In [49]:
frame2.values
Out[49]:
In [50]:
obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index
Out[50]:
In [51]:
index[1:]
Out[51]:
Index objects are immutable and thus can’t be modified by the user:
In [ ]:
# this below will generate an error
index[1] = 'd'
Immutability is important so that Index objects can be safely shared among data structures:
In [ ]:
index = pd.Index(np.arange(3))
obj2 = Series([1.5, -2.5, 0], index=index)
obj2.index is index
In addition to being array-like, an Index also functions as a fixed-size set:
In [52]:
frame3
Out[52]:
In [53]:
'Ohio' in frame3.columns
Out[53]:
In [54]:
2003 in frame3.index
Out[54]:
A critical method on pandas objects is reindex, which means to create a new object with the data conformed to a new index. Consider a simple example from above:
In [55]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
Out[55]:
Calling reindex on this Series rearranges the data according to the new index, introducing missing values if any index values were not already present:
In [56]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2
Out[56]:
For ordered data like time series, it may be desirable to do some interpolation or filling of values when reindexing. The method option allows us to do this, using a method such as ffill which forward fills the values:
In [57]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
Out[57]:
In [58]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')
Out[58]:
In [59]:
frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
columns=['Ohio', 'Texas', 'California'])
frame
Out[59]:
In [60]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2
Out[60]:
The columns can be reindexed using the columns keyword:
In [61]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
Out[61]:
Both can be reindexed in one shot, though interpolation will only apply row-wise (axis 0):
In [62]:
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill',
columns=states)
Out[62]:
As you’ll see soon, reindexing can be done more succinctly by label-indexing with ix:
In [ ]:
frame.ix[['a', 'b', 'c', 'd'], states]
Dropping one or more entries from an axis is easy if you have an index array or list without those entries. As that can require a bit of munging and set logic, the drop method will return a new object with the indicated value or values deleted from an axis:
In [63]:
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
new_obj
Out[63]:
In [64]:
obj.drop(['d', 'c'])
Out[64]:
With DataFrame, index values can be deleted from either axis:
In [65]:
data = DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
In [66]:
data.drop(['Colorado', 'Ohio'])
Out[66]:
In [69]:
data.drop('Utah', axis=0)
Out[69]:
In [67]:
data.drop('two', axis=1)
Out[67]:
In [68]:
data.drop(['two', 'four'], axis=1)
Out[68]:
Series indexing (obj[...]) works analogously to NumPy array indexing, except you can use the Series’s index values instead of only integers. Here are some examples of this:
In [72]:
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj
Out[72]:
In [73]:
obj['b']
Out[73]:
In [71]:
obj[1]
Out[71]:
In [74]:
obj[2:4]
Out[74]:
In [75]:
obj[['b', 'a', 'd']]
Out[75]:
In [76]:
obj[[1, 3]]
Out[76]:
In [77]:
obj[obj < 2]
Out[77]:
Slicing with labels behaves differently than normal Python slicing in that the endpoint is inclusive:
In [78]:
obj['b':'c']
Out[78]:
Setting using these methods works just as you would expect:
In [79]:
obj['b':'c'] = 5
obj
Out[79]:
As you’ve seen above, indexing into a DataFrame is for retrieving one or more columns either with a single value or sequence:
In [80]:
data = DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
Out[80]:
In [81]:
data['two']
Out[81]:
In [82]:
data[['three', 'one']]
Out[82]:
Indexing like this has a few special cases. First selecting rows by slicing or a boolean array:
In [83]:
data[:2]
Out[83]:
In [84]:
data[data['three'] > 5]
Out[84]:
In [85]:
data < 5
Out[85]:
In [86]:
data[data < 5] = 0
In [87]:
data
Out[87]:
Indexing field ix enables you to select a subset of the rows and columns from a DataFrame with NumPy- like notation plus axis labels. This is also a less verbose way to do reindexing:
In [88]:
# .ix was removed in pandas 1.0; both selectors here are labels, so .loc applies
data.loc['Colorado', ['two', 'three']]
Out[88]:
In [89]:
data.ix[['Colorado', 'Utah'], [3, 0, 1]]
Out[89]:
In [90]:
# .ix was removed in pandas 1.0; 2 is a position here, so .iloc applies
data.iloc[2]
Out[90]:
In [91]:
# .ix was removed in pandas 1.0; both selectors are labels, so .loc applies
data.loc[:'Utah', 'two']
Out[91]:
In [92]:
data.ix[data.three > 5, :3]
Out[92]:
One of the most important pandas features is the behavior of arithmetic between objects with different indexes. When adding together objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs. Let’s look at a simple example:
In [93]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
In [94]:
s1
Out[94]:
In [95]:
s2
Out[95]:
In [96]:
s1 + s2
Out[96]:
The internal data alignment introduces NA values in the indices that don’t overlap. Missing values propagate in arithmetic computations.
In the case of DataFrame, alignment is performed on both the rows and the columns:
In [97]:
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1
Out[97]:
In [98]:
df2
Out[98]:
Adding these together returns a DataFrame whose index and columns are the unions of the ones in each DataFrame:
In [99]:
df1 + df2
Out[99]:
In [100]:
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df1
Out[100]:
In [101]:
df2
Out[101]:
In [102]:
df1 + df2
Out[102]:
In [103]:
df1.add(df2, fill_value=0)
Out[103]:
Relatedly, when reindexing a Series or DataFrame, you can also specify a different fill value:
In [104]:
df1.reindex(columns=df2.columns, fill_value=0)
Out[104]:
In [105]:
arr = np.arange(12.).reshape((3, 4))
arr
Out[105]:
In [106]:
arr[0]
Out[106]:
In [107]:
arr - arr[0]
Out[107]:
This is referred to as broadcasting. Operations between a DataFrame and a Series are similar:
In [108]:
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
# .ix was removed in pandas 1.0; position-based row lookup is .iloc
series = frame.iloc[0]
frame
Out[108]:
In [109]:
series
Out[109]:
By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame's columns, broadcasting down the rows:
In [110]:
frame - series
Out[110]:
If an index value is not found in either the DataFrame’s columns or the Series’s index, the objects will be reindexed to form the union:
In [115]:
series2 = Series(range(3), index=['b', 'e', 'f'])
series2
Out[115]:
In [116]:
frame + series2
Out[116]:
If you want to instead broadcast over the columns, matching on the rows, you have to use one of the arithmetic methods. For example:
In [112]:
series3 = frame['d']
frame
Out[112]:
In [113]:
series3
Out[113]:
In [114]:
frame.sub(series3, axis=0)
Out[114]:
In [117]:
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
In [118]:
frame
Out[118]:
In [119]:
np.abs(frame)
Out[119]:
In [120]:
f = lambda x: x.max() - x.min()
In [121]:
frame.apply(f)
Out[121]:
In [122]:
frame.apply(f, axis=1)
Out[122]:
Many of the most common array statistics (like sum and mean) are DataFrame methods, so using apply is not necessary.
The function passed to apply need not return a scalar value, it can also return a Series with multiple values:
In [123]:
def f(x):
    """Return the smallest and largest values of `x` as a labeled Series."""
    extremes = [x.min(), x.max()]
    return Series(extremes, index=['min', 'max'])
frame.apply(f)
Out[123]:
Element-wise Python functions can be used, too. Suppose you wanted to compute a formatted string from each floating point value in frame. You can do this with applymap:
In [124]:
format = lambda x: '%.2f' % x
frame.applymap(format)
Out[124]:
The reason for the name applymap is that Series has a map method for applying an element-wise function:
In [125]:
frame['e'].map(format)
Out[125]:
Sorting a data set by some criterion is another important built-in operation. To sort lexicographically by row or column index, use the sort_index method, which returns a new, sorted object:
In [126]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()
Out[126]:
With a DataFrame, you can sort by index on either axis:
In [127]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
columns=['d', 'a', 'b', 'c'])
frame.sort_index()
Out[127]:
In [128]:
frame.sort_index(axis=1)
Out[128]:
The data is sorted in ascending order by default, but can be sorted in descending order, too:
In [129]:
frame.sort_index(axis=1, ascending=False)
Out[129]:
To sort a Series by its values, use its sort_values method (called order in older pandas versions):
In [130]:
obj = Series([4, 7, -3, 2])
# Series.order() was removed in pandas 0.20; sort_values() is the replacement
obj.sort_values()
Out[130]:
In [131]:
obj.sort_values()
Out[131]:
Any missing values are sorted to the end of the Series by default:
In [133]:
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()
Out[133]:
On DataFrame, you may want to sort by the values in one or more columns. To do so, pass one or more column names to the by option:
In [134]:
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
Out[134]:
In [135]:
# sort_index(by=...) was removed in pandas 0.20; sorting by column values
# is sort_values (equivalent call shown in the next cell)
frame.sort_values(by='b')
Out[135]:
In [137]:
frame.sort_values(by='b')
Out[137]:
In [138]:
# sort_index(by=...) was removed in pandas 0.20; use sort_values
frame.sort_values(by=['a', 'b'])
Out[138]:
In [139]:
frame.sort_values(by=['a', 'b'])
Out[139]:
Ranking is closely related to sorting, assigning ranks from one through the number of valid data points in an array. It is similar to the indirect sort indices produced by numpy.argsort, except that ties are broken according to a rule. The rank methods for Series and DataFrame are the place to look; by default rank breaks ties by assigning each group the mean rank:
In [141]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj
Out[141]:
In [143]:
obj.rank?
In [142]:
obj.rank()
Out[142]:
Ranks can also be assigned according to the order they’re observed in the data:
In [144]:
obj.rank(method='first')
Out[144]:
Naturally, you can rank in descending order, too:
In [147]:
obj.rank(ascending=False, method='max')
Out[147]:
The "method" is used for breaking ties:
In [ ]:
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
'c': [-2, 5, 8, -2.5]})
frame
In [ ]:
frame.rank(axis=1)
While many pandas functions (like reindex) require that the labels be unique, it’s not mandatory. Let’s consider a small Series with duplicate indices:
In [148]:
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj
Out[148]:
In [149]:
obj.index.is_unique
Out[149]:
Data selection is one of the main things that behaves differently with duplicates. Indexing a value with multiple entries returns a Series while single entries return a scalar value:
In [150]:
obj['a']
Out[150]:
In [151]:
obj['c']
Out[151]:
The same logic extends to indexing rows in a DataFrame:
In [152]:
df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
Out[152]:
In [153]:
df.ix['b']
Out[153]:
pandas objects are equipped with a set of common mathematical and statistical methods. Most of these fall into the category of reductions or summary statistics, methods that extract a single value (like the sum or mean) from a Series or a Series of values from the rows or columns of a DataFrame. Compared with the equivalent methods of vanilla NumPy arrays, they are all built from the ground up to exclude missing data. Consider a small DataFrame:
In [154]:
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
df
Out[154]:
Calling DataFrame’s sum method returns a Series containing column sums:
In [155]:
df.sum()
Out[155]:
Passing axis=1 sums over the rows instead:
In [156]:
df.sum(axis=1)
Out[156]:
NA values are excluded unless the entire slice (row or column in this case) is NA. This can be disabled using the skipna option:
In [157]:
df.mean(axis=1, skipna=False)
Out[157]:
Common options for each reduction method are:
Some methods, like idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained:
In [158]:
df.idxmax()
Out[158]:
Other methods are accumulations:
In [159]:
df.cumsum()
Out[159]:
Another type of method is neither a reduction nor an accumulation. describe is one such example, producing multiple summary statistics in one shot:
In [160]:
df.describe()
Out[160]:
On non-numeric data, describe produces alternate summary statistics:
In [161]:
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()
Out[161]:
Some summary statistics, like correlation and covariance, are computed from pairs of arguments. Let’s consider some DataFrames of stock prices and volumes obtained from Yahoo! Finance:
In [162]:
# NOTE(review): pandas.io.data was removed in pandas 0.19 — the maintained
# replacement is the separate pandas-datareader package
# (import pandas_datareader.data as web); confirm before re-running.
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker)
# dict.iteritems() does not exist in Python 3; items() works in both 2 and 3
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.items()})
Now, compute percent changes of the prices:
In [201]:
price.head()
Out[201]:
In [163]:
# fix: the DataFrame built above is named `price` — `priceA` is undefined
# and would raise a NameError
returns = price.pct_change()
returns.tail()
Out[163]:
The corr method of Series computes the correlation of the overlapping, non-NA, aligned-by-index values in two Series. Relatedly, cov computes the covariance:
In [166]:
returns.MSFT.corr(returns.IBM)
Out[166]:
In [167]:
returns.MSFT.cov(returns.IBM)
Out[167]:
DataFrame’s corr and cov methods, on the other hand, return a full correlation or covariance matrix as a DataFrame, respectively:
In [168]:
returns.corr()
Out[168]:
In [169]:
returns.cov()
Out[169]:
Using DataFrame’s corrwith method, you can compute pairwise correlations between a DataFrame’s columns or rows with another Series or DataFrame. Passing a Series returns a Series with the correlation value computed for each column:
In [170]:
returns.corrwith(returns.IBM)
Out[170]:
Passing a DataFrame computes the correlations of matching column names. Here I compute correlations of percent changes with volume:
In [171]:
returns.corrwith(volume)
Out[171]:
Passing axis=1 does things row-wise instead. In all cases, the data points are aligned by label before computing the correlation.
Another class of related methods extracts information about the values contained in a one-dimensional Series.
To illustrate these, consider this example:
In [172]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
The first function is unique, which gives you an array of the unique values in a Series:
In [173]:
uniques = obj.unique()
uniques
Out[173]:
The unique values are not necessarily returned in sorted order, but could be sorted after the fact if needed (uniques.sort()). Relatedly, value_counts computes a Series containing value frequencies:
In [174]:
obj.value_counts()
Out[174]:
The Series is sorted by value in descending order as a convenience. value_counts is also available as a top-level pandas method that can be used with any array or sequence:
In [175]:
pd.value_counts(obj.values, sort=False)
Out[175]:
Lastly, isin is responsible for vectorized set membership and can be very useful in filtering a data set down to a subset of values in a Series or column in a DataFrame:
In [176]:
mask = obj.isin(['b', 'c'])
mask
Out[176]:
In [177]:
obj[mask]
Out[177]:
In some cases, you may want to compute a histogram on multiple related columns in a DataFrame. Here’s an example:
In [178]:
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
'Qu2': [2, 3, 1, 2, 3],
'Qu3': [1, 5, 2, 4, 4]})
data
Out[178]:
Passing pandas.value_counts to this DataFrame’s apply function gives:
In [179]:
result = data.apply(pd.value_counts).fillna(0)
result
Out[179]:
Missing data is common in most data analysis applications. One of the goals in designing pandas was to make working with missing data as painless as possible. For example, all of the descriptive statistics on pandas objects exclude missing data as you’ve seen earlier.
pandas uses the floating point value NaN (Not a Number) to represent missing data in both floating as well as in non-floating point arrays. It is just used as a sentinel that can be easily detected:
In [260]:
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
Out[260]:
In [261]:
string_data.isnull()
Out[261]:
The built-in Python None value is also treated as NA in object arrays:
In [182]:
string_data[0] = None
string_data.isnull()
Out[182]:
Common NA handling methods are:
You have a number of options for filtering out missing data. While doing it by hand is always an option, dropna can be very helpful. On a Series, it returns the Series with only the non-null data and index values:
In [262]:
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()
Out[262]:
Naturally, you could have computed this yourself by boolean indexing:
In [185]:
data[data.notnull()]
Out[185]:
With DataFrame objects, these are a bit more complex. You may want to drop rows or columns which are all NA or just those containing any NAs. dropna by default drops any row containing a missing value:
In [186]:
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
[NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data
Out[186]:
In [187]:
cleaned
Out[187]:
Passing how='all' will only drop rows that are all NA:
In [188]:
data.dropna(how='all')
Out[188]:
Dropping columns in the same way is only a matter of passing axis=1:
In [189]:
data[4] = NA
data
Out[189]:
In [190]:
data.dropna(axis=1, how='all')
Out[190]:
A related way to filter out DataFrame rows tends to concern time series data. Suppose you want to keep only rows containing a certain number of observations. You can indicate this with the thresh argument:
In [263]:
df = DataFrame(np.random.randn(7, 3))
# .ix was removed in pandas 1.0; on the default RangeIndex the label slice
# :4 is inclusive (rows 0-4) and :2 covers rows 0-2, so the positional
# equivalents are iloc[:5] and iloc[:3]
df.iloc[:5, 1] = NA; df.iloc[:3, 2] = NA
df
Out[263]:
In [264]:
df.dropna(thresh=3)
Out[264]:
Rather than filtering out missing data (and potentially discarding other data along with it), you may want to fill in the “holes” in any number of ways. For most purposes, the fillna method is the workhorse function to use. Calling fillna with a constant replaces missing values with that value:
In [265]:
df.fillna(0)
Out[265]:
Calling fillna with a dict you can use a different fill value for each column:
In [195]:
df.fillna({1: 0.5, 2: -1})
Out[195]:
fillna returns a new object, but you can modify the existing object in place:
In [196]:
# always returns a reference to the filled object
_ = df.fillna(0, inplace=True)
df
Out[196]:
The same interpolation methods available for reindexing can be used with fillna:
In [269]:
df = DataFrame(np.random.randn(6, 3))
# .ix was removed in pandas 1.0; on the default RangeIndex the label slices
# 2: and 4: coincide with positions 2: and 4:, so .iloc is equivalent
df.iloc[2:, 1] = NA; df.iloc[4:, 2] = NA
df
Out[269]:
In [270]:
df.fillna(method='ffill')
Out[270]:
In [271]:
df.fillna(method='ffill', limit=2)
Out[271]:
With fillna you can do lots of other things with a little creativity. For example, you might pass the mean or median value of a Series:
In [199]:
data = Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())
Out[199]:
Hierarchical indexing is an important feature of pandas enabling you to have multiple (two or more) index levels on an axis. Somewhat abstractly, it provides a way for you to work with higher dimensional data in a lower dimensional form. Let’s start with a simple example; create a Series with a list of lists or arrays as the index:
In [266]:
data = Series(np.random.randn(10),
index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
[1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data
Out[266]:
In [267]:
data.index
Out[267]:
In [ ]:
data['b']
In [ ]:
data['b':'c']
In [ ]:
data.ix[['b', 'd']]
In [ ]:
data[:, 2]
In [ ]:
data.unstack()
In [ ]:
data.unstack().stack()
In [ ]:
frame = DataFrame(np.arange(12).reshape((4, 3)),
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns=[['Ohio', 'Ohio', 'Colorado'],
['Green', 'Red', 'Green']])
frame
In [ ]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame
In [ ]:
frame['Ohio']
In [ ]:
frame.swaplevel('key1', 'key2')
In [ ]:
# DataFrame.sortlevel() was removed in pandas 0.24; use sort_index(level=...)
frame.sort_index(level=1)
In [ ]:
# sortlevel() was removed in pandas 0.24; use sort_index(level=...)
frame.swaplevel(0, 1).sort_index(level=0)
In [ ]:
# NOTE(review): sum(level=...) is deprecated in modern pandas; the
# equivalent is frame.groupby(level='key2').sum() — confirm before updating
frame.sum(level='key2')
In [ ]:
# NOTE(review): sum(level=..., axis=1) is deprecated in modern pandas;
# presumably frame.T.groupby(level='color').sum().T is the replacement — verify
frame.sum(level='color', axis=1)
In [ ]:
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
'd': [0, 1, 2, 0, 1, 2, 3]})
frame
In [ ]:
frame2 = frame.set_index(['c', 'd'])
frame2
In [ ]:
frame.set_index(['c', 'd'], drop=False)
In [ ]:
frame2.reset_index()
In [ ]:
ser = Series(np.arange(3.))
ser.iloc[-1]
In [ ]:
ser
In [ ]:
ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]
In [ ]:
ser.ix[:1]
In [ ]:
ser3 = Series(range(3), index=[-5, 1, 3])
ser3.iloc[2]
In [ ]:
frame = DataFrame(np.arange(6).reshape((3, 2)), index=[2, 0, 1])
frame.iloc[0]
In [ ]:
# NOTE(review): pandas.io.data was removed in pandas 0.19 (replaced by the
# pandas-datareader package) and pd.Panel was removed in pandas 1.0, so this
# cell no longer runs as written on modern pandas
import pandas.io.data as web
pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk))
for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))
In [ ]:
pdata
In [ ]:
pdata = pdata.swapaxes('items', 'minor')
pdata['Adj Close']
In [ ]:
pdata.ix[:, '6/1/2012', :]
In [ ]:
pdata.ix['Adj Close', '5/22/2012':, :]
In [ ]:
stacked = pdata.ix[:, '5/30/2012':, :].to_frame()
stacked
In [ ]:
stacked.to_panel()