In [148]:
import pandas as pd
import numpy as np

In [4]:
# Five consecutive integers labelled 'a'..'e'. `arange` is reached through the
# `np` alias: the bare name is never imported, so it is undefined on a fresh kernel.
obj2 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])

In [5]:
obj2.values


Out[5]:
array([0, 1, 2, 3, 4])

In [6]:
obj2.index


Out[6]:
Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')

In [7]:
obj2


Out[7]:
a    0
b    1
c    2
d    3
e    4
dtype: int32

In [8]:
obj2[['a','c']]


Out[8]:
a    0
c    2
dtype: int32

In [10]:
obj2[obj2<2]


Out[10]:
a    0
b    1
dtype: int32

In [11]:
# Mapping of state name -> population figure.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)

In [20]:
# Series built straight from the dict: keys become the index labels.
obj3 = pd.Series(data=sdata)

In [21]:
# Labels to request from sdata: 'California' has no entry there, and 'Utah'
# is deliberately left out.
states = [
    'California',
    'Ohio',
    'Oregon',
    'Texas',
]

In [22]:
# Same dict, but restricted/reordered to the labels in `states`; a label with
# no matching key shows up as NaN in the result.
obj4 = pd.Series(data=sdata, index=states)

In [15]:
obj4


Out[15]:
California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [16]:
obj4.isnull()


Out[16]:
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [17]:
pd.notnull(obj4)


Out[17]:
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [18]:
pd.isnull(obj4)


Out[18]:
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [24]:
obj4


Out[24]:
California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [25]:
obj3+ obj4


Out[25]:
California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64

In [26]:
obj3


Out[26]:
Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [27]:
obj4


Out[27]:
California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [28]:
# Fill in the value for the 'California' label.
obj4.loc['California'] = 40000

In [29]:
obj3+obj4


Out[29]:
California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64

In [31]:
# Name the index axis and the Series itself; both names appear in the repr.
obj4.index.name = 'state'
obj4.name = 'population'

In [32]:
obj4


Out[32]:
state
California    40000
Ohio          35000
Oregon        16000
Texas         71000
Name: population, dtype: float64

In [40]:
# Column-oriented records: three equal-length lists, one per future column.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)

In [41]:
data


Out[41]:
{'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
 'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002]}

In [42]:
# Turn the dict of equal-length lists into a DataFrame, one column per key.
frame = pd.DataFrame(data=data)

In [36]:
frame


Out[36]:
pop state year
0 1.5 Ohio 2000
1 1.7 Ohio 2001
2 3.6 Ohio 2002
3 2.4 Nevada 2001
4 2.9 Nevada 2002

5 rows × 3 columns


In [50]:
pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'])


Out[50]:
year state pop debt
0 2000 Ohio 1.5 NaN
1 2001 Ohio 1.7 NaN
2 2002 Ohio 3.6 NaN
3 2001 Nevada 2.4 NaN
4 2002 Nevada 2.9 NaN

5 rows × 4 columns


In [51]:
frame['year']


Out[51]:
0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64

In [52]:
frame.year


Out[52]:
0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64

In [53]:
# Fetch the row whose index label is 3. `.loc` is the label-based indexer; the
# older `.ix` indexer was deprecated and later removed from pandas.
frame.loc[3]


Out[53]:
pop         2.4
state    Nevada
year       2001
Name: 3, dtype: object

In [56]:
'pop' in frame.columns


Out[56]:
True

In [59]:
3 in frame.index


Out[59]:
True

In [64]:
# Three colours on a gappy integer index (0, 2, 4), rebinding the name obj3.
obj3 = pd.Series(
    data=['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)

In [63]:
obj3.reindex(range(6), method='ffill')


Out[63]:
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [65]:
obj3


Out[65]:
0      blue
2    purple
4    yellow
dtype: object

In [67]:
# Floats 0.0 .. 4.0 labelled 'a' .. 'e'.
obj = pd.Series(
    data=np.arange(5, dtype=float),
    index=['a', 'b', 'c', 'd', 'e'],
)

In [69]:
obj.drop('b')


Out[69]:
a    0
c    2
d    3
e    4
dtype: float64

In [71]:
# 4x4 grid holding the integers 0..15, with state names on the rows and
# ordinal words on the columns.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [72]:
data


Out[72]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

4 rows × 4 columns


In [75]:
data.drop(['one'], axis=1)


Out[75]:
two three four
Ohio 1 2 3
Colorado 5 6 7
Utah 9 10 11
New York 13 14 15

4 rows × 3 columns


In [77]:
# Import only the two constructors that the cells below call unqualified
# (Series and DataFrame); a wildcard import would pour every public pandas
# name into the notebook namespace and hide where names come from.
from pandas import DataFrame, Series
# Floats 0.0 .. 3.0 labelled 'a' .. 'd'.
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

In [81]:
obj[['a','c']]


Out[81]:
a    0
c    2
dtype: float64

In [82]:
obj[2:4]


Out[82]:
c    2
d    3
dtype: float64

In [83]:
obj


Out[83]:
a    0
b    1
c    2
d    3
dtype: float64

In [84]:
obj[obj<2]


Out[84]:
a    0
b    1
dtype: float64

In [85]:
# Rebuild the 4x4 integer grid (0..15): rows are states, columns are
# 'one' .. 'four'.
data = DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [86]:
data


Out[86]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

4 rows × 4 columns


In [88]:
data[['two','four']]


Out[88]:
two four
Ohio 1 3
Colorado 5 7
Utah 9 11
New York 13 15

4 rows × 2 columns


In [90]:
data[data['three']>5]


Out[90]:
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

3 rows × 4 columns


In [91]:
data <5


Out[91]:
one two three four
Ohio True True True True
Colorado True False False False
Utah False False False False
New York False False False False

4 rows × 4 columns


In [92]:
data [ data <5 ]


Out[92]:
one two three four
Ohio 0 1 2 3
Colorado 4 NaN NaN NaN
Utah NaN NaN NaN NaN
New York NaN NaN NaN NaN

4 rows × 4 columns


In [93]:
# Rows chosen by label, columns chosen by position (3, 0, 1). The removed
# `.ix` indexer accepted that mix implicitly; here the positions are turned
# into column labels first so the whole selection is label-based.
data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]]


Out[93]:
four one two
Colorado 7 4 5
Utah 11 8 9

2 rows × 3 columns


In [94]:
data


Out[94]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

4 rows × 4 columns


In [95]:
# Label slice up to and including 'Utah', restricted to column 'two'.
# `.loc` replaces the `.ix` indexer, which no longer exists in pandas.
data.loc[:'Utah', 'two']


Out[95]:
Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32

In [96]:
# Keep the rows where column 'three' exceeds 5 and only the first three
# columns. The positional column slice is converted to labels so the lookup
# can go through `.loc` instead of the removed `.ix` indexer.
data.loc[data.three > 5, data.columns[:3]]


Out[96]:
one two three
Colorado 4 5 6
Utah 8 9 10
New York 12 13 14

3 rows × 3 columns


In [97]:
# First operand for the alignment demo: labels a, c, d, e.
s1 = Series(
    data=[7.3, -2.5, 3.4, 1.5],
    index=['a', 'c', 'd', 'e'],
)

In [98]:
# Second operand: labels a, c, e, f, g, so it only partly overlaps with s1.
s2 = Series(
    data=[-2.1, 3.6, -1.5, 4, 3.1],
    index=['a', 'c', 'e', 'f', 'g'],
)

In [101]:
s2+s1


Out[101]:
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [102]:
# 3x3 float frame (0.0 .. 8.0) with columns b, c, d.
df1 = DataFrame(
    np.arange(9, dtype=float).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)

In [103]:
# 4x3 float frame (0.0 .. 11.0) with columns b, d, e.
df2 = DataFrame(
    np.arange(12, dtype=float).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [104]:
df1+df2


Out[104]:
b c d e
Colorado NaN NaN NaN NaN
Ohio 3 NaN 6 NaN
Oregon NaN NaN NaN NaN
Texas 9 NaN 12 NaN
Utah NaN NaN NaN NaN

5 rows × 4 columns


In [105]:
# Two frames on the default integer index whose shapes differ by one row and
# one column: df1 is 3x4 (columns a-d), df2 is 4x5 (columns a-e).
df1 = DataFrame(
    np.arange(12, dtype=float).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)
df2 = DataFrame(
    np.arange(20, dtype=float).reshape(4, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)

In [106]:
df1


Out[106]:
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11

3 rows × 4 columns


In [107]:
df2


Out[107]:
a b c d e
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19

4 rows × 5 columns


In [108]:
df1+df2


Out[108]:
a b c d e
0 0 2 4 6 NaN
1 9 11 13 15 NaN
2 18 20 22 24 NaN
3 NaN NaN NaN NaN NaN

4 rows × 5 columns


In [109]:
df1.add(df2, fill_value=0)


Out[109]:
a b c d e
0 0 2 4 6 4
1 9 11 13 15 9
2 18 20 22 24 14
3 15 16 17 18 19

4 rows × 5 columns


In [112]:
df1.reindex(columns=df2.columns, fill_value=0)


Out[112]:
a b c d e
0 0 1 2 3 0
1 4 5 6 7 0
2 8 9 10 11 0

3 rows × 5 columns


In [111]:
df1


Out[111]:
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11

3 rows × 4 columns


In [113]:
# Plain 3x4 float array (0.0 .. 11.0).
arr = np.arange(12, dtype=float).reshape(3, 4)

In [114]:
arr[0]


Out[114]:
array([ 0.,  1.,  2.,  3.])

In [115]:
arr - arr[0]


Out[115]:
array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [116]:
arr


Out[116]:
array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [117]:
# 4x3 float frame (0.0 .. 11.0): states on the rows, columns b, d, e.
frame = DataFrame(
    np.arange(12, dtype=float).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [118]:
# First row by position. The index holds state names, so the integer 0 can
# only mean "position 0"; `.iloc` says that explicitly, whereas the removed
# `.ix` indexer relied on a silent positional fallback.
frame.iloc[0]


Out[118]:
b    0
d    1
e    2
Name: Utah, dtype: float64

In [119]:
frame


Out[119]:
b d e
Utah 0 1 2
Ohio 3 4 5
Texas 6 7 8
Oregon 9 10 11

4 rows × 3 columns


In [124]:
# 4x3 frame of standard-normal draws. The sample comes from a locally seeded
# generator so a fresh-kernel re-run shows the same numbers every time, and the
# global numpy random state is left untouched. The outputs recorded below were
# captured from an earlier unseeded run and will differ from these values.
frame = DataFrame(np.random.RandomState(0).randn(4, 3), columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [126]:
np.abs(frame)


Out[126]:
b d e
Utah 0.411465 0.933316 1.051063
Ohio 1.247883 0.034203 0.628053
Texas 1.347686 0.963852 0.791951
Oregon 0.422665 1.932881 0.418523

4 rows × 3 columns


In [127]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()

In [128]:
frame.apply(f)


Out[128]:
b    1.770351
d    2.896733
e    1.679115
dtype: float64

In [129]:
frame


Out[129]:
b d e
Utah 0.411465 -0.933316 -1.051063
Ohio 1.247883 0.034203 0.628053
Texas 1.347686 -0.963852 -0.791951
Oregon -0.422665 1.932881 -0.418523

4 rows × 3 columns


In [130]:
# Formatter that renders a number as a string with two decimal places.
# NOTE(review): this binding shadows the built-in `format` for the rest of the
# session; the name is kept because the next cell passes it by name.
format = lambda x: '%.2f' % x

In [131]:
# Run the two-decimal formatter over every element of the frame.
# NOTE(review): newer pandas releases deprecate `applymap` in favour of
# `DataFrame.map` — confirm against the pandas version this notebook targets.
frame.applymap(format)


Out[131]:
b d e
Utah 0.41 -0.93 -1.05
Ohio 1.25 0.03 0.63
Texas 1.35 -0.96 -0.79
Oregon -0.42 1.93 -0.42

4 rows × 3 columns


In [132]:
# 2x4 integer frame whose row and column labels are both out of alphabetical
# order, which makes the effect of sorting visible.
frame = DataFrame(
    np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=['d', 'a', 'b', 'c'],
)

In [137]:
frame.sort_index(axis=1)


Out[137]:
a b c d
three 1 2 3 0
one 5 6 7 4

2 rows × 4 columns


In [134]:
frame


Out[134]:
d a b c
three 0 1 2 3
one 4 5 6 7

2 rows × 4 columns


In [138]:
frame.sort_index(axis=1, ascending=False)


Out[138]:
d c b a
three 0 3 2 1
one 4 7 6 5

2 rows × 4 columns


In [139]:
# Integer Series containing ties (two 7s and two 4s) for the ranking demo.
obj = Series(data=[7, -5, 7, 4, 2, 0, 4])

In [140]:
obj.rank()


Out[140]:
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [141]:
# Small two-column frame with missing values: row 'a' lacks 'two' and row 'c'
# is entirely NaN.
df = DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)

In [144]:
# Order the columns by their labels. `DataFrame.sort` has been removed from
# pandas; `sort_index(axis=1)` is the label-sorting replacement.
df.sort_index(axis=1)


Out[144]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3

4 rows × 2 columns


In [147]:
df.sum(axis=1)


Out[147]:
a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [149]:
# Three columns (Qu1..Qu3) of small integers, five rows each.
data = DataFrame(dict(
    Qu1=[1, 3, 4, 3, 4],
    Qu2=[2, 3, 1, 2, 3],
    Qu3=[1, 5, 2, 4, 4],
))

In [150]:
data


Out[150]:
Qu1 Qu2 Qu3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4

5 rows × 3 columns


In [153]:
# Count how often each value occurs in every column; values missing from a
# column come back as NaN and are replaced by 0. The Series method is used
# because the top-level `pd.value_counts` function is deprecated.
data.apply(pd.Series.value_counts).fillna(0)


Out[153]:
Qu1 Qu2 Qu3
1 1 1 1
2 0 2 1
3 2 2 0
4 2 0 2
5 0 0 1

5 rows × 3 columns


In [155]:
# Same tally taken across each row (axis=1): one output column per distinct
# value, 0 where a value is absent from the row. The Series method is used
# because the top-level `pd.value_counts` function is deprecated.
data.apply(pd.Series.value_counts, axis=1).fillna(0)


Out[155]:
1 2 3 4 5
0 2 1 0 0 0
1 0 0 2 0 1
2 1 1 0 1 0
3 0 1 1 1 0
4 0 0 1 2 0

5 rows × 5 columns


In [156]:
# String Series with one missing entry, at position 2.
string_data = Series(data=['aardvark', 'artichoke', np.nan, 'avocado'])

In [157]:
string_data.isnull()


Out[157]:
0    False
1    False
2     True
3    False
dtype: bool

In [158]:
string_data.fillna('none')


Out[158]:
0     aardvark
1    artichoke
2         none
3      avocado
dtype: object

In [159]:
# Short alias for NaN, reused by the cells below.
NA = np.nan
# Float Series with gaps at positions 1 and 3.
data = Series(data=[1, NA, 3.5, NA, 7])

In [160]:
data.dropna()


Out[160]:
0    1.0
2    3.5
4    7.0
dtype: float64

In [162]:
data[data.notnull()]


Out[162]:
0    1.0
2    3.5
4    7.0
dtype: float64

In [163]:
# 4x3 frame mixing complete, partly missing and fully missing rows.
data = DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])

In [164]:
data


Out[164]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3

4 rows × 3 columns


In [165]:
data.dropna()


Out[165]:
0 1 2
0 1 6.5 3

1 rows × 3 columns


In [166]:
data.dropna(how='all')


Out[166]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
3 NaN 6.5 3

3 rows × 3 columns


In [169]:
data.dropna(axis=1, how='all')


Out[169]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3

4 rows × 3 columns


In [170]:
# 7x3 frame of standard-normal draws from a locally seeded generator, so the
# values are the same on every fresh-kernel run and the global numpy random
# state is not modified. The outputs recorded below were captured from an
# earlier unseeded run and will differ from these values.
df = DataFrame(np.random.RandomState(1).randn(7, 3))

In [176]:
# Blank out a staircase of entries. On the default integer index these are
# label slices, so both endpoints are included: rows 0-4 of column 1, rows 0-2
# of column 2, and the single cell (0, 0). `.loc` replaces the removed `.ix`
# indexer, and each assignment gets its own line.
df.loc[:4, 1] = NA
df.loc[:2, 2] = NA
df.loc[0, 0] = NA

In [177]:
df


Out[177]:
0 1 2
0 NaN NaN NaN
1 -1.117429 NaN NaN
2 -1.097185 NaN NaN
3 0.351192 NaN -2.874015
4 0.768508 NaN 0.101283
5 0.109518 1.036179 0.215995
6 -0.865862 0.459417 0.180664

7 rows × 3 columns


In [179]:
df.dropna(axis=0, how='all')


Out[179]:
0 1 2
1 -1.117429 NaN NaN
2 -1.097185 NaN NaN
3 0.351192 NaN -2.874015
4 0.768508 NaN 0.101283
5 0.109518 1.036179 0.215995
6 -0.865862 0.459417 0.180664

6 rows × 3 columns


In [182]:
df.dropna(thresh=1)


Out[182]:
0 1 2
1 -1.117429 NaN NaN
2 -1.097185 NaN NaN
3 0.351192 NaN -2.874015
4 0.768508 NaN 0.101283
5 0.109518 1.036179 0.215995
6 -0.865862 0.459417 0.180664

6 rows × 3 columns


In [ ]: