In [10]:
# Chapter 5: Getting Started with pandas
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [11]:
## Introduction to pandas data structures

In [12]:
### Series

In [13]:
# Build a Series from a plain list; no index is given, so pandas supplies 0..3.
obj = Series(data=[4, 7, -5, 3])

In [14]:
obj


Out[14]:
0    4
1    7
2   -5
3    3

In [15]:
obj.values


Out[15]:
array([ 4,  7, -5,  3], dtype=int64)

In [16]:
obj.index


Out[16]:
Int64Index([0, 1, 2, 3], dtype=int64)

In [17]:
# Series with explicit string labels 'd', 'b', 'a', 'c' instead of a default integer index.
obj2 = Series([4, 7, -5, 3], index=list('dbac'))

In [18]:
obj2


Out[18]:
d    4
b    7
a   -5
c    3

In [19]:
obj2.index


Out[19]:
Index([d, b, a, c], dtype=object)

In [20]:
obj2['d'] = 6

In [21]:
obj2


Out[21]:
d    6
b    7
a   -5
c    3

In [22]:
obj2[['c','a','d']]


Out[22]:
c    3
a   -5
d    6

In [23]:
obj2[obj2>0]


Out[23]:
d    6
b    7
c    3

In [24]:
obj2*2


Out[24]:
d    12
b    14
a   -10
c     6

In [25]:
np.exp(obj2)


Out[25]:
d     403.428793
b    1096.633158
a       0.006738
c      20.085537

In [26]:
'b' in obj2


Out[26]:
True

In [27]:
'e' in obj2


Out[27]:
False

In [28]:
# State name -> integer value; used below to build a Series directly from a dict.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}

In [29]:
obj3 = Series(sdata)

In [30]:
obj3


Out[30]:
Ohio      35000
Oregon    16000
Texas     71000
Utah       5000

In [31]:
obj3.index


Out[31]:
Index([Ohio, Oregon, Texas, Utah], dtype=object)

In [32]:
obj3.values


Out[32]:
array([35000, 16000, 71000,  5000], dtype=int64)

In [33]:
states = ['California','Ohio','Oregon','Texas']

In [34]:
obj4 = Series(sdata, index=states)

In [35]:
obj4


Out[35]:
California      NaN
Ohio          35000
Oregon        16000
Texas         71000

In [36]:
pd.isnull(obj4)


Out[36]:
California     True
Ohio          False
Oregon        False
Texas         False

In [37]:
pd.notnull(obj4)


Out[37]:
California    False
Ohio           True
Oregon         True
Texas          True

In [38]:
obj4.isnull()


Out[38]:
California     True
Ohio          False
Oregon        False
Texas         False

In [39]:
obj3 + obj4


Out[39]:
California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN

In [40]:
obj4.name = 'population'

In [41]:
obj4.index.name = 'state'

In [42]:
obj4


Out[42]:
state
California      NaN
Ohio          35000
Oregon        16000
Texas         71000
Name: population

In [43]:
# Note: a Series' index can be modified in place by assignment

In [44]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']

In [45]:
obj


Out[45]:
Bob      4
Steve    7
Jeff    -5
Ryan     3

In [46]:
### DataFrame

In [47]:
# Column-oriented input: each key is a column name, each list holds that column's values.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}

In [48]:
frame = DataFrame(data)

In [49]:
frame


Out[49]:
pop state year
0 1.5 Ohio 2000
1 1.7 Ohio 2001
2 3.6 Ohio 2002
3 2.4 Nevada 2001
4 2.9 Nevada 2002

In [50]:
DataFrame(data, columns=['year', 'state', 'pop'])


Out[50]:
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9

In [51]:
# 'debt' is requested as a column but has no entry in `data`, so it shows up
# filled with NaN in the resulting frame.
frame2 = DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'state', 'pop', 'debt'],
)

In [52]:
frame2


Out[52]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN

In [53]:
frame2.columns


Out[53]:
Index([year, state, pop, debt], dtype=object)

In [54]:
frame2['state']


Out[54]:
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state

In [55]:
frame2.year


Out[55]:
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year

In [56]:
# Select the row labelled 'three' as a Series. `.loc` is the label-based
# indexer; `.ix` was deprecated in pandas 0.20 and removed in 1.0.
frame2.loc['three']


Out[56]:
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three

In [57]:
# Broadcast a scalar to every row of the existing 'debt' column. Bracket
# assignment is used because attribute assignment only works for columns that
# already exist; otherwise it silently sets a plain Python attribute instead.
frame2['debt'] = 16.5

In [58]:
frame2


Out[58]:
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5

In [59]:
frame2['debt'] = np.arange(5.)

In [60]:
frame2


Out[60]:
year state pop debt
one 2000 Ohio 1.5 0
two 2001 Ohio 1.7 1
three 2002 Ohio 3.6 2
four 2001 Nevada 2.4 3
five 2002 Nevada 2.9 4

In [61]:
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

In [62]:
# Assign a Series to 'debt'. Values are aligned on index labels, so rows with
# no matching label in `val` ('one' and 'three') end up as NaN.
frame2['debt'] = val

In [63]:
frame2


Out[63]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7

In [64]:
frame2['eastern'] = frame2.state == 'Ohio'

In [65]:
frame2


Out[65]:
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False

In [66]:
frame2.columns


Out[66]:
Index([year, state, pop, debt, eastern], dtype=object)

In [67]:
del frame2['eastern']

In [68]:
frame2.columns


Out[68]:
Index([year, state, pop, debt], dtype=object)

In [69]:
# Nested dict: outer keys are read as columns, inner keys as row labels.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

In [70]:
frame3 = DataFrame(pop)

In [71]:
frame3


Out[71]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6

In [72]:
frame3.T


Out[72]:
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6

In [73]:
DataFrame(pop, index=[2001, 2002, 2003])


Out[73]:
Nevada Ohio
2001 2.4 1.7
2002 2.9 3.6
2003 NaN NaN

In [74]:
# Dict of Series built from positional slices of frame3's columns.
# `.iloc` makes the positional intent explicit: the index holds integer year
# labels, so a bare `[:-1]` / `[:2]` is easy to misread as label-based.
pdata = {
    'Ohio': frame3['Ohio'].iloc[:-1],      # every row except the last
    'Nevada': frame3['Nevada'].iloc[:2],   # first two rows
}

In [75]:
DataFrame(pdata)


Out[75]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7

In [76]:
# Name both axes; the names are shown in the frame's printed header.
frame3.index.name = 'year'
frame3.columns.name = 'state'

In [77]:
frame3


Out[77]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6

In [78]:
frame3.values


Out[78]:
array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [79]:
frame2.values


Out[79]:
array([[2000, Ohio, 1.5, nan],
       [2001, Ohio, 1.7, -1.2],
       [2002, Ohio, 3.6, nan],
       [2001, Nevada, 2.4, -1.5],
       [2002, Nevada, 2.9, -1.7]], dtype=object)

In [80]:
frame2


Out[80]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7

In [81]:
### Index objects

In [82]:
# Three consecutive integers labelled 'a', 'b', 'c'; its index is inspected below.
obj = Series(range(3), index=list('abc'))

In [83]:
obj


Out[83]:
a    0
b    1
c    2

In [84]:
index =  obj.index

In [85]:
index


Out[85]:
Index([a, b, c], dtype=object)

In [88]:
index[1:]


Out[88]:
Index([b, c], dtype=object)

In [89]:
index = pd.Index(np.arange(3))

In [90]:
index


Out[90]:
Int64Index([0, 1, 2], dtype=int64)

In [91]:
obj2 = Series([1.2, -2.5, 0], index = index)

In [92]:
obj2


Out[92]:
0    1.2
1   -2.5
2    0.0

In [93]:
frame3


Out[93]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6

In [94]:
'Ohio' in frame3.columns


Out[94]:
True

In [95]:
2003 in frame3.index


Out[95]:
False

In [96]:
## Essential functionality

In [97]:
### Reindexing

In [98]:
# Float Series whose labels are deliberately out of alphabetical order ('d', 'b', 'a', 'c').
obj = Series([4.5, 7.2, -5.3, 3.6], index=list('dbac'))

In [99]:
obj


Out[99]:
d    4.5
b    7.2
a   -5.3
c    3.6

In [100]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [101]:
obj2


Out[101]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN

In [102]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)


Out[102]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0

In [103]:
# Series of strings labelled with the non-contiguous integers 0, 2 and 4.
obj3 = Series(
    ['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)

In [104]:
obj3


Out[104]:
0      blue
2    purple
4    yellow

In [105]:
obj3.reindex(range(6), method='ffill')


Out[105]:
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow

In [106]:
# 3x3 grid of 0..8; row labels skip 'c' ('a', 'b', 'd').
frame = DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'b', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)

In [107]:
frame


Out[107]:
Ohio Texas California
a 0 1 2
b 3 4 5
d 6 7 8

In [108]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [109]:
frame2


Out[109]:
Ohio Texas California
a 0 1 2
b 3 4 5
c NaN NaN NaN
d 6 7 8

In [110]:
states = ['Texas', 'Utah', 'California']

In [111]:
frame.reindex(columns=states)


Out[111]:
Texas Utah California
a 1 NaN 2
b 4 NaN 5
d 7 NaN 8

In [112]:
# Reindex rows with forward-fill, then select/insert columns in a second step.
# Passing `method='ffill'` together with `columns=` applies the fill to the
# column lookup as well, which raises ValueError on recent pandas versions
# because the frame's columns ('Ohio', 'Texas', 'California') are not monotonic.
(frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill')
      .reindex(columns=states))


Out[112]:
Texas Utah California
a 1 NaN 2
b 4 NaN 5
c 4 NaN 5
d 7 NaN 8

In [113]:
# Label-based selection where some labels ('c', 'Utah') do not exist in the
# frame. `.ix` was removed in pandas 1.0 and `.loc` raises KeyError for
# missing labels, so use `reindex`, which inserts NaN for them.
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)


Out[113]:
Texas Utah California
a 1 NaN 2
b 4 NaN 5
c NaN NaN NaN
d 7 NaN 8

In [114]:
### Dropping entries from an axis

In [115]:
# Integers 0..4 labelled 'a'..'e'; used to demonstrate `drop`.
obj = Series(np.arange(5), index=list('abcde'))

In [116]:
obj


Out[116]:
a    0
b    1
c    2
d    3
e    4

In [117]:
new_obj = obj.drop('c')

In [118]:
new_obj


Out[118]:
a    0
b    1
d    3
e    4

In [119]:
# 4x4 grid of 0..15 with place names as row labels and ordinal words as column labels.
data = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [120]:
data


Out[120]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

In [121]:
data.drop(['Colorado', 'Ohio'])


Out[121]:
one two three four
Utah 8 9 10 11
New York 12 13 14 15

In [122]:
data.drop('two', axis=1)


Out[122]:
one three four
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
New York 12 14 15

In [123]:
data.drop(['two', 'four'], axis=1)


Out[123]:
one three
Ohio 0 2
Colorado 4 6
Utah 8 10
New York 12 14

In [127]:
### Indexing, selection, and filtering

In [130]:
data


Out[130]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

In [ ]: