In [6]:
import pandas as pd
import numpy as np
import numpy as np

"""10 Minutes to Pandas: http://pandas.pydata.org/pandas-docs/stable/10min.html#missing-data"""

In [38]:
# Object Creation
# Builds the basic pandas objects (Series, DatetimeIndex, DataFrame) and
# prints each of them under a labelled header.
import pandas as pd
import numpy as np

# Series data: a list containing np.nan produces a Series with a missing value.
obj1 = pd.Series([1, 3, 5, np.nan, 6, 8])
# sep='' stops print() from inserting a space before the first output row,
# which would otherwise shift that row out of alignment with the rest.
print('\n+=== Series Data: \n', obj1, sep='')

# Date data: ten consecutive dates starting on 2013-02-25.
obj2 = pd.date_range("20130225", periods=10)
print('\n+=== Date Data: \n', obj2, sep='')

# Format data: a 10x4 frame of normally distributed values indexed by obj2.
# A dedicated, seeded RandomState makes the frame identical on every run of
# this cell without touching NumPy's global random state.
obj3 = pd.DataFrame(
    np.random.RandomState(0).randn(10, 4),
    index=obj2,
    columns=list('ABCD'),
)
print('\n+=== Format Data: \n', obj3, sep='')

# Series-like format data: a frame built from a dict whose values are a mix of
# scalars (broadcast to every row) and 4-element array-likes.
obj4 = pd.DataFrame({
    'A': 1,
    'B': pd.date_range("20130225", periods=4),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['Test', 'Train', 'Test2', 'Train2']),
    'F': 'Foo'
})
# Print the frame followed by the dtype of each column.
print('\n+=== Series-like format Data: \n', obj4, '\n', obj4.dtypes, sep='')


+=== Series Data: 
 0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

+=== Date Data: 
 DatetimeIndex(['2013-02-25', '2013-02-26', '2013-02-27', '2013-02-28',
               '2013-03-01', '2013-03-02', '2013-03-03', '2013-03-04',
               '2013-03-05', '2013-03-06'],
              dtype='datetime64[ns]', freq='D')

+=== Format Data: 
                    A         B         C         D
2013-02-25 -0.459068 -0.987549 -1.447412  3.082147
2013-02-26  3.273940  1.187620 -1.263051 -0.509848
2013-02-27  0.416887 -0.889416 -0.372518 -0.373707
2013-02-28  0.232641  0.048246 -0.962750 -0.862494
2013-03-01 -0.234853  0.392017 -1.066371  0.265463
2013-03-02  0.134706  0.229381  1.728758 -0.143046
2013-03-03 -0.065125 -0.293687  0.651796 -1.062646
2013-03-04  0.101482 -2.598790  1.009392 -0.516823
2013-03-05 -0.705383  0.163391  2.204861 -0.116296
2013-03-06  0.658669 -0.579698  1.311628 -0.013751

+=== Series-like format Data: 
    A          B    C  D       E    F
0  1 2013-02-25  1.0  3    Test  Foo
1  1 2013-02-26  1.0  3   Train  Foo
2  1 2013-02-27  1.0  3   Test2  Foo
3  1 2013-02-28  1.0  3  Train2  Foo 
 A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [18]:
# Viewing Data
# Shows the common ways of inspecting a DataFrame: head/tail, index, columns,
# raw values, summary statistics, transposition and sorting.
import pandas as pd
import numpy as np


# Default
dates = pd.date_range("20130225", periods=5)
# Seeded RandomState: the 5x4 sample frame is the same on every run, so the
# printed output can be reproduced.
df = pd.DataFrame(
    np.random.RandomState(0).randn(5, 4),
    index=dates,
    columns=list('ABCD'),
)

# sep='' on every print: the header already ends in '\n', and the default
# separator would put a stray space in front of the first output row.
print('\n+=== DataFrame Head\n', df.head(), sep='')
print('\n+=== DataFrame Tail\n', df.tail(3), sep='')  # last three rows
print('\n+=== DataFrame Index\n', df.index, sep='')
print('\n+=== DataFrame Columns\n', df.columns, sep='')
print('\n+=== DataFrame Values\n', df.values, sep='')  # underlying NumPy array
print('\n+=== DataFrame Describe\n', df.describe(), sep='')
print('\n+=== DataFrame Transposing your data\n', df.T, sep='')
# axis=1 sorts the column labels; ascending=False gives D, C, B, A.
print('\n+=== DataFrame Sorting by an axis\n', df.sort_index(axis=1, ascending=False), sep='')
# Rows ordered by the values in column B.
print('\n+=== DataFrame Sorting by values\n', df.sort_values(by='B'), sep='')


+=== DataFrame Head
                    A         B         C         D
2013-02-25 -0.066709  1.691487  0.146585 -1.083411
2013-02-26  0.769033  0.418229 -0.446144 -0.281829
2013-02-27 -1.802976  0.039549 -0.326418  1.686606
2013-02-28 -1.143043  0.842802 -0.037886 -0.648760
2013-03-01 -1.042495 -0.717522  0.275440  1.116691

+=== DataFrame Tail
                    A         B         C         D
2013-02-27 -1.802976  0.039549 -0.326418  1.686606
2013-02-28 -1.143043  0.842802 -0.037886 -0.648760
2013-03-01 -1.042495 -0.717522  0.275440  1.116691

+=== DataFrame Index
 DatetimeIndex(['2013-02-25', '2013-02-26', '2013-02-27', '2013-02-28',
               '2013-03-01'],
              dtype='datetime64[ns]', freq='D')

+=== DataFrame Columns
 Index(['A', 'B', 'C', 'D'], dtype='object')

+=== DataFrame Values
 [[-0.06670928  1.6914875   0.14658523 -1.08341052]
 [ 0.76903279  0.41822925 -0.44614434 -0.28182898]
 [-1.80297649  0.03954869 -0.32641831  1.68660628]
 [-1.14304273  0.8428022  -0.03788611 -0.64876019]
 [-1.04249507 -0.71752217  0.27544041  1.11669054]]

+=== DataFrame Describe
               A         B         C         D
count  5.000000  5.000000  5.000000  5.000000
mean  -0.657238  0.454909 -0.077685  0.157859
std    1.009965  0.898339  0.305863  1.187554
min   -1.802976 -0.717522 -0.446144 -1.083411
25%   -1.143043  0.039549 -0.326418 -0.648760
50%   -1.042495  0.418229 -0.037886 -0.281829
75%   -0.066709  0.842802  0.146585  1.116691
max    0.769033  1.691487  0.275440  1.686606

+=== DataFrame Transposing your data
    2013-02-25  2013-02-26  2013-02-27  2013-02-28  2013-03-01
A   -0.066709    0.769033   -1.802976   -1.143043   -1.042495
B    1.691487    0.418229    0.039549    0.842802   -0.717522
C    0.146585   -0.446144   -0.326418   -0.037886    0.275440
D   -1.083411   -0.281829    1.686606   -0.648760    1.116691

+=== DataFrame Sorting by an axis
                    D         C         B         A
2013-02-25 -1.083411  0.146585  1.691487 -0.066709
2013-02-26 -0.281829 -0.446144  0.418229  0.769033
2013-02-27  1.686606 -0.326418  0.039549 -1.802976
2013-02-28 -0.648760 -0.037886  0.842802 -1.143043
2013-03-01  1.116691  0.275440 -0.717522 -1.042495

+=== DataFrame Sorting by values
                    A         B         C         D
2013-03-01 -1.042495 -0.717522  0.275440  1.116691
2013-02-27 -1.802976  0.039549 -0.326418  1.686606
2013-02-26  0.769033  0.418229 -0.446144 -0.281829
2013-02-28 -1.143043  0.842802 -0.037886 -0.648760
2013-02-25 -0.066709  1.691487  0.146585 -1.083411

In [21]:
# Getting
# Demonstrates plain [] indexing on a DataFrame: one column, a positional row
# slice, and a row slice by index label.
import pandas as pd
import numpy as np


# Default
dates = pd.date_range("20130225", periods=5)
# Seeded RandomState: the 5x4 sample frame is the same on every run, so the
# printed output can be reproduced.
df = pd.DataFrame(
    np.random.RandomState(0).randn(5, 4),
    index=dates,
    columns=list('ABCD'),
)

# sep='' on every print: the header already ends in '\n', and the default
# separator would put a stray space in front of the first output row.
# A column label selects that column as a Series.
print('\n+=== DataFrame get []\n', df['A'], sep='')
# An integer slice selects rows by position (here the first three).
print('\n+=== DataFrame get []\n', df[0: 3], sep='')
# A slice of date strings selects rows by index label; both ends are included.
print('\n+=== DataFrame get []\n', df['2013-02-26': '2013-02-28'], sep='')


+=== DataFrame get []
 2013-02-25    0.125750
2013-02-26   -0.809164
2013-02-27   -1.111902
2013-02-28   -0.557638
2013-03-01   -0.659017
Freq: D, Name: A, dtype: float64

+=== DataFrame get []
                    A         B         C         D
2013-02-25  0.125750  0.627943 -0.232457  1.423480
2013-02-26 -0.809164  1.085099 -1.423564  0.507384
2013-02-27 -1.111902 -0.088581  1.128016  0.224681

+=== DataFrame get []
                    A         B         C         D
2013-02-26 -0.809164  1.085099 -1.423564  0.507384
2013-02-27 -1.111902 -0.088581  1.128016  0.224681
2013-02-28 -0.557638 -2.289267 -2.241341  0.466005

In [25]:
# Selection by Label
# Demonstrates label-based selection with .loc (rows, columns, slices) and
# scalar access by label with .at.
import pandas as pd
import numpy as np


# Default
dates = pd.date_range("20130225", periods=5)
# Seeded RandomState: the 5x4 sample frame is the same on every run, so the
# printed output can be reproduced.
df = pd.DataFrame(
    np.random.RandomState(0).randn(5, 4),
    index=dates,
    columns=list('ABCD'),
)

# sep='' on every print: the header already ends in '\n', and the default
# separator would put a stray space in front of the first output row.
# One row, selected by its index label, returned as a Series.
print('\n+=== DataFrame df.loc[dates[0]]\n', df.loc[dates[0]], sep='')
# All rows, columns A and B only.
print('\n+=== DataFrame df.loc[:,["A","B"]]\n', df.loc[:,['A','B']], sep='')
# Label slice on the rows (both endpoints included), columns A and B.
print('\n+=== DataFrame df.loc["2013-02-26":"2013-02-28",["A","B"]]\n', df.loc['2013-02-26':'2013-02-28',['A','B']], sep='')
# A single row label reduces the result to a Series.
print('\n+=== DataFrame df.loc["2013-02-26",["A","B"]]\n', df.loc['2013-02-26',['A','B']], sep='')
# Scalar access: .loc and .at return the same value for one row/column pair.
print('\n+=== DataFrame df.loc[dates[0],"A"]\n', df.loc[dates[0],'A'], sep='')
print('\n+=== DataFrame df.at[dates[0],"A"]\n', df.at[dates[0],'A'], sep='')


+=== DataFrame df.loc[dates[0]]
 A   -0.829786
B   -0.519569
C    0.092754
D    1.946923
Name: 2013-02-25 00:00:00, dtype: float64

+=== DataFrame df.loc[:,["A","B"]]
                    A         B
2013-02-25 -0.829786 -0.519569
2013-02-26  0.055743 -0.268200
2013-02-27 -0.361156  0.373584
2013-02-28 -0.701896  1.979653
2013-03-01 -0.289689 -0.431045

+=== DataFrame df.loc["2013-02-26":"2013-02-28",["A","B"]]
                    A         B
2013-02-26  0.055743 -0.268200
2013-02-27 -0.361156  0.373584
2013-02-28 -0.701896  1.979653

+=== DataFrame df.loc["2013-02-26",["A","B"]]
 A    0.055743
B   -0.268200
Name: 2013-02-26 00:00:00, dtype: float64

+=== DataFrame df.loc[dates[0],"A"]
 -0.829786185417

+=== DataFrame df.at[dates[0],"A"]
 -0.829786185417

In [31]:
# Selection by Position
# Demonstrates integer-position selection with .iloc (rows, columns, slices,
# explicit position lists) and scalar access by position with .iat.
import pandas as pd
import numpy as np


# Default
dates = pd.date_range("20130225", periods=5)
# Seeded RandomState: the 5x4 sample frame is the same on every run, so the
# printed output can be reproduced.
df = pd.DataFrame(
    np.random.RandomState(0).randn(5, 4),
    index=dates,
    columns=list('ABCD'),
)


# sep='' on every print: the header already ends in '\n', and the default
# separator would put a stray space in front of the first output row.
# Fourth row (position 3) as a Series.
print('\n+=== DataFrame df.iloc[3]\n', df.iloc[3], sep='')
# Integer slices exclude the stop position: rows 3-4, columns 0-1.
print('\n+=== DataFrame df.iloc[3:5,0:2]\n', df.iloc[3:5,0:2], sep='')
# Explicit lists of row and column positions.
print('\n+=== DataFrame df.iloc[[1,2,4],[0,2]]\n', df.iloc[[1,2,4],[0,2]], sep='')
# Row slice with every column.
print('\n+=== DataFrame df.iloc[1:3,:]\n', df.iloc[1:3,:], sep='')
# Column slice with every row.
print('\n+=== DataFrame df.iloc[:,1:3]\n', df.iloc[:,1:3], sep='')
# Scalar access: .iloc and .iat return the same value for one position pair.
print('\n+=== DataFrame df.iloc[1,1]\n', df.iloc[1,1], sep='')
print('\n+=== DataFrame df.iat[1,1]\n', df.iat[1,1], sep='')


+=== DataFrame df.iloc[3]
 A   -0.037650
B   -1.906120
C    0.374508
D   -1.739052
Name: 2013-02-28 00:00:00, dtype: float64

+=== DataFrame df.iloc[3:5,0:2]
                    A         B
2013-02-28 -0.037650 -1.906120
2013-03-01 -0.950265  0.090712

+=== DataFrame df.iloc[[1,2,4],[0,2]]
                    A         C
2013-02-26  0.994786  1.150232
2013-02-27 -1.301550 -0.211548
2013-03-01 -0.950265  0.918038

+=== DataFrame df.iloc[1:3,:]
                    A         B         C         D
2013-02-26  0.994786 -1.780810  1.150232  0.014797
2013-02-27 -1.301550  0.104018 -0.211548 -0.917351

+=== DataFrame df.iloc[:,1:3]
                    B         C
2013-02-25  0.276956  1.438158
2013-02-26 -1.780810  1.150232
2013-02-27  0.104018 -0.211548
2013-02-28 -1.906120  0.374508
2013-03-01  0.090712  0.918038

+=== DataFrame df.iloc[1,1]
 -1.78081029199

+=== DataFrame df.iat[1,1]
 -1.78081029199

In [37]:
# Boolean Indexing
# Demonstrates filtering with boolean masks: by one column's values, by a
# whole-frame condition, and by membership with isin().
import pandas as pd
import numpy as np


# Default
dates = pd.date_range("20130225", periods=5)
# Seeded RandomState: the 5x4 sample frame is the same on every run, so the
# printed output can be reproduced.
df = pd.DataFrame(
    np.random.RandomState(0).randn(5, 4),
    index=dates,
    columns=list('ABCD'),
)


# sep='' on every print: the header already ends in '\n', and the default
# separator would put a stray space in front of the first output row.
# Keep only the rows whose value in column A is positive.
print('\n+=== DataFrame df[df.A > 0]\n', df[df.A > 0], sep='')
# Element-wise mask: the shape is kept and values failing the test become NaN.
print('\n+=== DataFrame df[df > 0]\n', df[df > 0], sep='')

# Work on a copy so that adding column E leaves df itself unchanged.
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four']
print('\n+=== DataFrame copy and insert\n', df2, sep='')
# isin() builds a mask that is True where E is one of the listed values.
print('\n+=== DataFrame using isin()\n', df2[df2['E'].isin(['two','four'])], sep='')


+=== DataFrame df[df.A > 0]
                    A         B         C         D
2013-02-26  1.299022 -0.588605  0.627271 -0.808533
2013-02-27  0.300711  0.014402  0.509765 -0.213479
2013-02-28  1.289546 -0.348935  0.297340 -1.061342

+=== DataFrame df[df > 0]
                    A         B         C         D
2013-02-25       NaN  0.244766  0.179876  1.327536
2013-02-26  1.299022       NaN  0.627271       NaN
2013-02-27  0.300711  0.014402  0.509765       NaN
2013-02-28  1.289546       NaN  0.297340       NaN
2013-03-01       NaN  0.626660       NaN  0.291464

+=== DataFrame copy and insert
                    A         B         C         D      E
2013-02-25 -0.203309  0.244766  0.179876  1.327536    one
2013-02-26  1.299022 -0.588605  0.627271 -0.808533    one
2013-02-27  0.300711  0.014402  0.509765 -0.213479    two
2013-02-28  1.289546 -0.348935  0.297340 -1.061342  three
2013-03-01 -0.919772  0.626660 -0.396494  0.291464   four

+=== DataFrame using isin()
                    A         B         C         D     E
2013-02-27  0.300711  0.014402  0.509765 -0.213479   two
2013-03-01 -0.919772  0.626660 -0.396494  0.291464  four

In [ ]: