In [1]:

    
# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

Object Creation - Series and DataFrame



In [2]:

    
# creating Series (array)

s1 = pd.Series([[1,3,5,np.nan,6,8]])
s1









    Out[2]:





0    [1, 3, 5, nan, 6, 8]
dtype: object



In [3]:

    
# creating DataFrames
dates = pd.date_range('20130101', periods=6)
dates









    Out[3]:





DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')



In [4]:

    
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df



In [5]:

    
# Creating a DataFrame by passing a dict of objects that can be converted to series-like.
df2 = pd.DataFrame({ 'A' : 1.,
                         'B' : pd.Timestamp('20130102'),
                         'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                         'D' : np.array([3] * 4,dtype='int32'),
                         'E' : pd.Categorical(["test","train","test","train"]),
                         'F' : 'foo' })
df2



In [6]:

    
# checking Data Types
df2.dtypes









    Out[6]:





A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object



In [7]:

    
# Use df2.<TAB> for column name completion as well as attributes which can work on dataframe

Viewing Data



In [8]:

    
# for first 5 records
df.head()



In [9]:

    
# for last 3
df.tail(3)



In [10]:

    
# checking df index
df.index









    Out[10]:





DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')



In [11]:

    
# column names
df.columns









    Out[11]:





Index(['A', 'B', 'C', 'D'], dtype='object')



In [12]:

    
# df values
df.values









    Out[12]:





array([[-0.00557444, -0.38565665, -0.19326961,  0.26490064],
       [-0.20867983, -0.29648787, -0.79000981, -1.39103244],
       [-1.6166742 , -0.04217266, -1.35645484, -0.71763967],
       [ 2.15684898, -0.20123222,  0.16578647, -0.16246515],
       [-2.01986196, -2.05520516,  1.36948649,  1.18219902],
       [ 0.81525976,  0.76637688,  0.16043882,  0.35385518]])

Summary of Data



In [13]:

    
# information abt df
df.info()









    



<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0 bytes



In [14]:

    
# describing stastistic summay
df.describe()

Transposing and Sorting the data



In [15]:

    
df.T









    Out[15]:






  
    
      
      2013-01-01 00:00:00
      2013-01-02 00:00:00
      2013-01-03 00:00:00
      2013-01-04 00:00:00
      2013-01-05 00:00:00
      2013-01-06 00:00:00
    
  
  
    
      A
      -0.005574
      -0.208680
      -1.616674
      2.156849
      -2.019862
      0.815260
    
    
      B
      -0.385657
      -0.296488
      -0.042173
      -0.201232
      -2.055205
      0.766377
    
    
      C
      -0.193270
      -0.790010
      -1.356455
      0.165786
      1.369486
      0.160439
    
    
      D
      0.264901
      -1.391032
      -0.717640
      -0.162465
      1.182199
      0.353855



In [16]:

    
# Sorting by index
df.sort_index(axis=1, ascending=False)



In [17]:

    
# sorting by values
df.sort_values(by="B")



In [18]:

    
df.sort_values(by=["B", "A"])

Selection of Data

In pandas, data can be accessed with these methods .at, .iat, .loc, .iloc and .ix



In [19]:

    
# selecting a column A
df['A']









    Out[19]:





2013-01-01   -0.005574
2013-01-02   -0.208680
2013-01-03   -1.616674
2013-01-04    2.156849
2013-01-05   -2.019862
2013-01-06    0.815260
Freq: D, Name: A, dtype: float64



In [20]:

    
# slicing rows
df[0:3]



In [21]:

    
df['2013-01-01':'2013-01-03']

Selection by lable



In [22]:

    
# cross selection using a lable
df.loc[dates[0]]









    Out[22]:





A   -0.005574
B   -0.385657
C   -0.193270
D    0.264901
Name: 2013-01-01 00:00:00, dtype: float64



In [23]:

    
df.loc[:, ['A', 'B']]   # [row, column]



In [24]:

    
# Showing label slicing, both endpoints are included
df.loc['20130102':'20130104',['A','B']]



In [25]:

    
df.loc['20130102',['A','B']]









    Out[25]:





A   -0.208680
B   -0.296488
Name: 2013-01-02 00:00:00, dtype: float64



In [26]:

    
# For getting a scalar value
df.loc['20130102',['A']]









    Out[26]:





A   -0.20868
Name: 2013-01-02 00:00:00, dtype: float64



In [27]:

    
df.loc['20130102','B']









    Out[27]:





-0.29648786843717295



In [28]:

    
# for faster access
#df.at['20130102', 'A']

Selection by Position



In [29]:

    
# Select via the position of the passed integers
df.iloc[3]









    Out[29]:





A    2.156849
B   -0.201232
C    0.165786
D   -0.162465
Name: 2013-01-04 00:00:00, dtype: float64



In [30]:

    
# By integer slices, acting similar to numpy/python
df.iloc[3:5, 2:4]   # row - 3 n 4 , col = 2, 3



In [31]:

    
# By lists of integer position locations, similar to the numpy/python style
df.iloc[[1,3,4], 2:4]



In [32]:

    
# For slicing rows explicitly
df.iloc[1:3,:]



In [33]:

    
# For slicing columns explicitly
df.iloc[:,1:3]



In [34]:

    
# For getting a value explicitly
df.iloc[1,1]









    Out[34]:





-0.29648786843717295



In [35]:

    
df.loc['2013-01-02', 'B']









    Out[35]:





-0.29648786843717295



In [36]:

    
# For getting fast access to a scalar (equiv to the prior method)
df.iat[1,1]









    Out[36]:





-0.29648786843717295

Boolean Indexing



In [37]:

    
df.B > 0









    Out[37]:





2013-01-01    False
2013-01-02    False
2013-01-03    False
2013-01-04    False
2013-01-05    False
2013-01-06     True
Freq: D, Name: B, dtype: bool



In [38]:

    
df[df.B > 0]



In [39]:

    
df[df>0]



In [40]:

    
# Using the isin() method for filtering:
df3 = df.copy()

df3['E'] = ['one', 'one','two','three','four','three']

df3



In [41]:

    
df3[df3['E'].isin(['two','four'])]

Setting a new column



In [42]:

    
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1









    Out[42]:





2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64



In [43]:

    
df['F'] = s1
df



In [44]:

    
# Setting values by label
df.at[dates[0],'A'] = 0
df



In [45]:

    
# Setting values by position
df.iat[0,1] = 0
df



In [46]:

    
# Setting by assigning with a numpy array
df.loc[:,'D'] = np.array([5] * len(df))
df



In [47]:

    
df2 = df.copy()
df2



In [48]:

    
# to replace all the positive value from its negative

df2[df2 > 0] = -df2
df2



In [49]:

    
df2[df2 < 0] = -df2
df2

Missing Data



In [50]:

    
# Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1



In [51]:

    
df1.loc[dates[1],'E'] = 1
df1



In [52]:

    
# to drop any rows that have missing data.
df1.dropna(how='any')   # if any columns have NULL or NaN



In [53]:

    
df1.dropna()



In [54]:

    
df1.dropna(how='all')   # if ALL columns have NULL or NaN

Filling missing data



In [55]:

    
df1.fillna(3)



In [56]:

    
df1.fillna(value=4)



In [57]:

    
df1.fillna({'F':3, 'E':2.9})   # Fill F column with 3 and E column wih 2.9



In [58]:

    
pd.isnull(df1)









    Out[58]:






  
    
      
      A
      B
      C
      D
      F
      E
    
  
  
    
      2013-01-01
      False
      False
      False
      False
      True
      True
    
    
      2013-01-02
      False
      False
      False
      False
      False
      False
    
    
      2013-01-03
      False
      False
      False
      False
      False
      True
    
    
      2013-01-04
      False
      False
      False
      False
      False
      True

Stats



In [59]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 5 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null int32
F    5 non-null float64
dtypes: float64(4), int32(1)
memory usage: 264.0 bytes



In [60]:

    
df.describe()



In [61]:

    
df.count()









    Out[61]:





A    6
B    6
C    6
D    6
F    5
dtype: int64



In [62]:

    
df.mean()  # column wise









    Out[62]:





A   -0.145518
B   -0.304787
C   -0.107337
D    5.000000
F    3.000000
dtype: float64



In [63]:

    
df.mean(1)  # row wise









    Out[63]:





2013-01-01    1.201683
2013-01-02    0.940964
2013-01-03    0.796940
2013-01-04    2.024281
2013-01-05    1.258884
2013-01-06    2.348415
Freq: D, dtype: float64



In [64]:

    
df.std()









    Out[64]:





A    1.544449
B    0.936377
C    0.933920
D    0.000000
F    1.581139
dtype: float64



In [65]:

    
df.std(1)









    Out[65]:





2013-01-01    2.533850
2013-01-02    2.362652
2013-01-03    2.751355
2013-01-04    2.134913
2013-01-05    3.288437
2013-01-06    2.434258
Freq: D, dtype: float64



In [66]:

    
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s









    Out[66]:





2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64



In [67]:

    
s = s.shift(2)  # shifting the content by 2 index
s









    Out[67]:





2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64



In [68]:

    
df.sub(s, axis='index')

Apply



In [69]:

    
df.apply(np.cumsum)



In [70]:

    
df.apply(lambda x: x.max() - x.min())









    Out[70]:





A    4.176711
B    2.821582
C    2.725941
D    0.000000
F    4.000000
dtype: float64

Histogramming



In [71]:

    
s = pd.Series(np.random.randint(0, 7, size=10))
s









    Out[71]:





0    0
1    1
2    5
3    0
4    0
5    1
6    4
7    4
8    3
9    0
dtype: int32



In [72]:

    
s.value_counts()  # checking unique value counts









    Out[72]:





0    4
4    2
1    2
5    1
3    1
dtype: int64

String Methods



In [73]:

    
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s









    Out[73]:





0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object



In [74]:

    
s.str.lower()









    Out[74]:





0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

Merge

Concat



In [75]:

    
df = pd.DataFrame(np.random.randn(10, 4))
df



In [76]:

    
pieces = [df[:3], df[3:7], df[7:]]
pieces









    Out[76]:





[          0         1         2         3
 0  0.862392 -0.487448  1.767044  1.619584
 1 -0.192533  0.452153  0.160491  0.163961
 2  0.646025 -0.710062 -0.764748  2.061172,
           0         1         2         3
 3 -1.356019 -0.077627 -0.947133  0.952742
 4  0.883336  1.352857 -1.960570  1.267786
 5  1.811561 -2.001506 -0.595290 -0.820276
 6 -2.535877 -0.415728 -0.000989 -1.231573,
           0         1         2         3
 7 -0.021901  1.382943 -0.957129 -0.228767
 8  0.813857 -0.864718  1.981206  1.672659
 9 -0.443268  0.921847 -0.548603  0.190629]



In [77]:

    
pd.concat(pieces)  # concat rowwise



In [78]:

    
pd.concat(pieces, axis=1)  # concat column wise

Join

pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False)



In [79]:

    
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

left



In [80]:

    
right



In [81]:

    
pd.merge(left, right, on='key')



In [82]:

    
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

left



In [83]:

    
pd.merge(left, right, on='key')

Append



In [84]:

    
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df



In [85]:

    
s = df.iloc[3]
s









    Out[85]:





A   -0.316136
B    0.618400
C   -1.937116
D   -0.129132
Name: 3, dtype: float64



In [86]:

    
df.append(s, ignore_index=True)
df

Grouping

By “group by” we are referring to a process involving one or more of the following steps

    Splitting the data into groups based on some criteria
    Applying a function to each group independently
    Combining the results into a data structure



In [87]:

    
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                       'B' : ['one', 'one', 'two', 'three',
                              'two', 'two', 'one', 'three'],
                       'C' : np.random.randn(8),
                       'D' : np.random.randn(8)})
df



In [88]:

    
df.groupby('A').sum()



In [89]:

    
df.groupby(['A','B']).sum()

Reshaping



In [90]:

    
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                        ['one', 'two', 'one', 'two',
                         'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index









    Out[90]:





MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])



In [91]:

    
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df

The stack() method “compresses” a level in the DataFrame’s columns



In [92]:

    
df.stack()









    Out[92]:





first  second   
bar    one     A    0.077103
               B    0.071665
       two     A    0.243469
               B    0.789529
baz    one     A   -0.452906
               B    0.148716
       two     A   -0.829278
               B   -1.180111
foo    one     A   -0.251384
               B   -0.004602
       two     A   -0.857481
               B    0.512469
qux    one     A    1.660127
               B   -1.580966
       two     A    0.521759
               B   -1.693887
dtype: float64

With a “stacked” DataFrame or Series (having a MultiIndex as the index), the inverse operation of stack() is unstack(), which by default unstacks the last level:



In [93]:

    
df.unstack()  # this will unstack the inner index (last level)



In [94]:

    
df.unstack('first')  # unstacking by lable



In [95]:

    
df.unstack('second')



In [96]:

    
df.unstack(0)    # unstacking by index

Pivot Tables



In [97]:

    
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                       'B' : ['A', 'B', 'C'] * 4,
                       'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                       'D' : np.random.randn(12),
                       'E' : np.random.randn(12)})
df



In [98]:

    
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Time Series



In [99]:

    
rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng[:5]









    Out[99]:





DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04'],
              dtype='datetime64[ns]', freq='S')



In [100]:

    
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts[:8]









    Out[100]:





2012-01-01 00:00:00      0
2012-01-01 00:00:01    175
2012-01-01 00:00:02      9
2012-01-01 00:00:03    173
2012-01-01 00:00:04    285
2012-01-01 00:00:05    262
2012-01-01 00:00:06    148
2012-01-01 00:00:07    255
Freq: S, dtype: int32



In [101]:

    
ts.sum()









    Out[101]:





26318



In [102]:

    
ts.resample('5Min').sum()









    Out[102]:





2012-01-01    26318
Freq: 5T, dtype: int32



In [103]:

    
ts.resample('5Min')









    



C:\Anaconda3\lib\site-packages\IPython\lib\pretty.py:108: FutureWarning: .resample() is now a deferred operation
use .resample(...).mean() instead of .resample(...)
  return getattr(obj, attr, default)






    Out[103]:





DatetimeIndexResampler [freq=<5 * Minutes>, axis=0, closed=left, label=left, convention=start, base=0]

Time zone representation



In [104]:

    
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
rng









    Out[104]:





DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')



In [105]:

    
ts = pd.Series(np.random.randn(len(rng)), rng)
ts









    Out[105]:





2012-03-06    1.166836
2012-03-07    0.309653
2012-03-08    0.953754
2012-03-09    0.606327
2012-03-10   -0.804818
Freq: D, dtype: float64



In [106]:

    
ts_utc = ts.tz_localize('UTC')
ts_utc









    Out[106]:





2012-03-06 00:00:00+00:00    1.166836
2012-03-07 00:00:00+00:00    0.309653
2012-03-08 00:00:00+00:00    0.953754
2012-03-09 00:00:00+00:00    0.606327
2012-03-10 00:00:00+00:00   -0.804818
Freq: D, dtype: float64

Convert to another time zone



In [107]:

    
ts_utc.tz_convert('Asia/Calcutta')









    Out[107]:





2012-03-06 05:30:00+05:30    1.166836
2012-03-07 05:30:00+05:30    0.309653
2012-03-08 05:30:00+05:30    0.953754
2012-03-09 05:30:00+05:30    0.606327
2012-03-10 05:30:00+05:30   -0.804818
Freq: D, dtype: float64

Converting between time span representations



In [108]:

    
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)

ts









    Out[108]:





2012-01-31   -1.609627
2012-02-29   -1.381380
2012-03-31    1.065309
2012-04-30   -0.192158
2012-05-31   -0.742545
Freq: M, dtype: float64



In [109]:

    
ps = ts.to_period()
ps









    Out[109]:





2012-01   -1.609627
2012-02   -1.381380
2012-03    1.065309
2012-04   -0.192158
2012-05   -0.742545
Freq: M, dtype: float64



In [110]:

    
ps.to_timestamp()









    Out[110]:





2012-01-01   -1.609627
2012-02-01   -1.381380
2012-03-01    1.065309
2012-04-01   -0.192158
2012-05-01   -0.742545
Freq: MS, dtype: float64

Converting between period and timestamp enables some convenient arithmetic functions to be used. In the following example, we convert a quarterly frequency with year ending in November to 9am of the end of the month following the quarter end:



In [111]:

    
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)

ts.head()









    Out[111]:





1990Q1   -0.583649
1990Q2    0.855731
1990Q3   -0.625833
1990Q4    1.185059
1991Q1   -0.317823
Freq: Q-NOV, dtype: float64



In [112]:

    
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9

ts.head()









    Out[112]:





1990-03-01 09:00   -0.583649
1990-06-01 09:00    0.855731
1990-09-01 09:00   -0.625833
1990-12-01 09:00    1.185059
1991-03-01 09:00   -0.317823
Freq: H, dtype: float64

Categoricals



In [113]:

    
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})

df



In [114]:

    
df.dtypes









    Out[114]:





id            int64
raw_grade    object
dtype: object



In [115]:

    
df["grade"] = df["raw_grade"].astype("category")

df["grade"]









    Out[115]:





0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

Rename the categories to more meaningful names (assigning to Series.cat.categories is inplace!)



In [116]:

    
df["grade"].cat.categories = ["very good", "good", "very bad"]

df



In [117]:

    
# Reorder the categories and simultaneously add the
# missing categories (methods under Series .cat return a new Series per default).

df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])

df



In [118]:

    
# Sorting is per order in the categories, not lexical order.
df.sort_values(by="grade")



In [119]:

    
# Grouping by a categorical column shows also empty categories.
df.groupby("grade").size()









    Out[119]:





grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

Plotting



In [120]:

    
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))

ts = ts.cumsum()

ts.plot()









    Out[120]:





<matplotlib.axes._subplots.AxesSubplot at 0x8d607b8>



In [121]:

    
# On DataFrame, plot() is a convenience to plot all of the columns with labels:

df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()

df.plot(grid=True)









    Out[121]:





<matplotlib.axes._subplots.AxesSubplot at 0x8f4ecf8>

Data Read and Write



In [122]:

    
df[:4]

to_csv and read_csv



In [123]:

    
# writing it to csv
df.to_csv("dataset/df_as_csv.csv")



In [124]:

    
# reading it 
df2 = pd.read_csv("dataset/df_as_csv.csv")
df2.head()

for hdfs, to_hdf and read_hdf



In [125]:

    
df.to_hdf('foo.h5','df')



In [126]:

    
pd.read_hdf('foo.h5','df').head()

For Excel, to_excel and read_excel



In [127]:

    
df.to_excel('foo.xlsx', sheet_name='Sheet1')



In [129]:

    
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']).head()



In [ ]:

	A	B	C	D
count	6.000000	6.000000	6.000000	6.000000
mean	-0.146447	-0.369063	-0.107337	-0.078364
std	1.544346	0.924431	0.933920	0.898054
min	-2.019862	-2.055205	-1.356455	-1.391032
25%	-1.264676	-0.363364	-0.640825	-0.578846
50%	-0.107127	-0.248860	-0.016415	0.051218
75%	0.610051	-0.081938	0.164450	0.331617
max	2.156849	0.766377	1.369486	1.182199

	A	B	C	D
0	-0.770577	-0.170541	-1.076229	-0.451980
1	1.916760	0.572713	1.164140	0.632860
2	-1.005490	-1.477879	-0.110988	0.870474
3	-0.316136	0.618400	-1.937116	-0.129132
4	0.175017	-0.135414	-0.088902	1.275741
5	0.385744	-0.969253	-0.307694	0.746990
6	1.229319	-1.976747	-0.243775	0.166781
7	-0.940357	-0.894759	0.202037	0.233910

	A	B	C	D
0	-0.770577	-0.170541	-1.076229	-0.451980
1	1.916760	0.572713	1.164140	0.632860
2	-1.005490	-1.477879	-0.110988	0.870474
3	-0.316136	0.618400	-1.937116	-0.129132
4	0.175017	-0.135414	-0.088902	1.275741
5	0.385744	-0.969253	-0.307694	0.746990
6	1.229319	-1.976747	-0.243775	0.166781
7	-0.940357	-0.894759	0.202037	0.233910

	C	D
A
bar	-0.002539	-0.801601
foo	1.881491	-0.501767

		C	D
A	B
bar	one	-1.189741	-0.096997
	three	0.103044	0.251160
	two	1.084158	-0.955764
foo	one	0.270598	0.284141
	three	-1.174013	1.183452
	two	2.784906	-1.969361

	A	B	C	D
2013-01-01	-0.005574	-0.385657	-0.193270	0.264901
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640
2013-01-04	2.156849	-0.201232	0.165786	-0.162465
2013-01-05	-2.019862	-2.055205	1.369486	1.182199
2013-01-06	0.815260	0.766377	0.160439	0.353855

	A	B	C	D	E	F
0	1.0	2013-01-02	1.0	3	test	foo
1	1.0	2013-01-02	1.0	3	train	foo
2	1.0	2013-01-02	1.0	3	test	foo
3	1.0	2013-01-02	1.0	3	train	foo

	A	B	C	D	F
2013-01-01	0.000000	-0.385657	-0.193270	0.264901	NaN
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032	1.0
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640	2.0
2013-01-04	2.156849	-0.201232	0.165786	-0.162465	3.0
2013-01-05	-2.019862	-2.055205	1.369486	1.182199	4.0
2013-01-06	0.815260	0.766377	0.160439	0.353855	5.0

	A	B	C	D	F	E
2013-01-01	False	False	False	False	True	True
2013-01-02	False	False	False	False	False	False
2013-01-03	False	False	False	False	False	True
2013-01-04	False	False	False	False	False	True

	0	1	2	3
0	0.862392	-0.487448	1.767044	1.619584
1	-0.192533	0.452153	0.160491	0.163961
2	0.646025	-0.710062	-0.764748	2.061172
3	-1.356019	-0.077627	-0.947133	0.952742
4	0.883336	1.352857	-1.960570	1.267786
5	1.811561	-2.001506	-0.595290	-0.820276
6	-2.535877	-0.415728	-0.000989	-1.231573
7	-0.021901	1.382943	-0.957129	-0.228767
8	0.813857	-0.864718	1.981206	1.672659
9	-0.443268	0.921847	-0.548603	0.190629

	A	B	C	D
0	foo	one	0.206509	-0.437572
1	bar	one	-1.189741	-0.096997
2	foo	two	1.289879	-0.130252
3	bar	three	0.103044	0.251160
4	foo	two	1.495027	-1.839108
5	bar	two	1.084158	-0.955764
6	foo	one	0.064089	0.721713
7	foo	three	-1.174013	1.183452

		A	B
first	second
bar	one	0.077103	0.071665
bar	two	0.243469	0.789529
baz	one	-0.452906	0.148716
baz	two	-0.829278	-1.180111
foo	one	-0.251384	-0.004602
foo	two	-0.857481	0.512469
qux	one	1.660127	-1.580966
qux	two	0.521759	-1.693887

	A	B	C	D	E
0	one	A	foo	0.869958	-2.824236
1	one	B	foo	-1.263218	0.101729
2	two	C	foo	-0.743723	1.409101
3	three	A	bar	0.292353	0.282451
4	one	B	bar	-0.091658	-0.752110
5	one	C	bar	1.295819	-0.448113
6	two	A	foo	1.573841	0.196957
7	three	B	foo	-0.645814	1.409150
8	one	C	foo	0.635127	-1.389018
9	one	A	bar	-1.174726	-0.077247
10	two	B	bar	1.264011	-1.072017
11	three	C	bar	1.441732	0.543449

	id	raw_grade	grade
0	1	a	very good
1	2	b	good
2	3	b	good
3	4	a	very good
4	5	a	very good
5	6	e	very bad

	A	B	C	D
2000-01-01	0.383624	0.429101	0.743808	1.577623
2000-01-02	-0.609680	0.453151	1.483189	0.865432
2000-01-03	-0.652463	0.116940	2.300037	-0.406062
2000-01-04	-1.804128	0.703522	1.802338	-0.694557