In [38]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Build a Series from a plain list; with no index given, the labels are 0..3.
# Spelled pd.Series to match how the rest of the notebook constructs Series.
obj = pd.Series([4, 8, -5, 2])

In [3]:
obj


Out[3]:
0    4
1    8
2   -5
3    2
dtype: int64

In [4]:
# Same data-from-list construction, but with explicit string labels.
# Spelled pd.Series to match how the rest of the notebook constructs Series.
obj2 = pd.Series([1, 2, -4, 8], index=['a', 'c', 'd', 'b'])

In [5]:
obj2


Out[5]:
a    1
c    2
d   -4
b    8
dtype: int64

In [6]:
obj.values


Out[6]:
array([ 4,  8, -5,  2])

In [10]:
type(obj2.values)


Out[10]:
numpy.ndarray

In [12]:
obj2.index


Out[12]:
Index(['a', 'c', 'd', 'b'], dtype='object')

In [13]:
type(obj2.index)


Out[13]:
pandas.core.indexes.base.Index

In [14]:
obj2['a']


Out[14]:
1

In [15]:
# Assign through the label: the value stored under 'd' is overwritten in place.
obj2['d'] = 4
obj2


Out[15]:
a    1
c    2
d    4
b    8
dtype: int64

In [16]:
obj2[obj2 > 2]


Out[16]:
d    4
b    8
dtype: int64

In [17]:
'b' in obj2


Out[17]:
True

In [18]:
# A dict maps straight onto a Series: keys become the labels, values the data.
sdata = dict(Ohio=1234, Texas=789)
obj3 = pd.Series(data=sdata)
obj3


Out[18]:
Ohio     1234
Texas     789
dtype: int64

In [19]:
# With an explicit index the dict entries are looked up by label; 'California'
# has no entry in sdata, so it comes out as NaN and the dtype becomes float64.
obj4 = pd.Series(sdata, index=['Ohio', 'California', 'Texas'])

In [20]:
obj4


Out[20]:
Ohio          1234.0
California       NaN
Texas          789.0
dtype: float64

In [21]:
pd.isnull(obj4)


Out[21]:
Ohio          False
California     True
Texas         False
dtype: bool

In [22]:
obj4.isnull()


Out[22]:
Ohio          False
California     True
Texas         False
dtype: bool

In [24]:
print(obj4.name)


None

In [25]:
# Give the Series a name; it was None until now (see the print above).
obj4.name = 'pop'

In [26]:
# Column-oriented records: every key is a column name and every list holds
# that column's six values (three Ohio rows followed by three Nevada rows).
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.8, 3.6, 2.4, 2.9, 3.2],
}

In [27]:
# Build a DataFrame from the dict of equal-length lists; each key becomes a column.
frame = pd.DataFrame(data)

In [28]:
frame


Out[28]:
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.8
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2

In [29]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])


Out[29]:
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.8
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9
5 2003 Nevada 3.2

In [30]:
frame.columns


Out[30]:
Index(['state', 'year', 'pop'], dtype='object')

In [32]:
frame['year']


Out[32]:
0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [33]:
frame.year


Out[33]:
0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [35]:
frame.loc[1]


Out[35]:
state    Ohio
year     2001
pop       1.8
Name: 1, dtype: object

In [36]:
# Assigning a scalar to a new column name creates the column and fills every row with it.
frame['debt'] = 17.5

In [37]:
frame


Out[37]:
state year pop debt
0 Ohio 2000 1.5 17.5
1 Ohio 2001 1.8 17.5
2 Ohio 2002 3.6 17.5
3 Nevada 2001 2.4 17.5
4 Nevada 2002 2.9 17.5
5 Nevada 2003 3.2 17.5

In [40]:
# Overwrite the column with an array of six floats (0.0 through 5.0), one per row.
frame['debt'] = np.arange(6.)

In [41]:
frame


Out[41]:
state year pop debt
0 Ohio 2000 1.5 0.0
1 Ohio 2001 1.8 1.0
2 Ohio 2002 3.6 2.0
3 Nevada 2001 2.4 3.0
4 Nevada 2002 2.9 4.0
5 Nevada 2003 3.2 5.0

In [42]:
# Only the labels 0, 1 and 4 carry a value in this Series.
val = pd.Series({0: -1.2, 1: -1.5, 4: -1.7})

In [43]:
# Assigning a Series aligns on the row labels: rows 2, 3 and 5 have no entry
# in val, so they end up as NaN.
frame['debt'] = val
frame


Out[43]:
state year pop debt
0 Ohio 2000 1.5 -1.2
1 Ohio 2001 1.8 -1.5
2 Ohio 2002 3.6 NaN
3 Nevada 2001 2.4 NaN
4 Nevada 2002 2.9 -1.7
5 Nevada 2003 3.2 NaN

In [44]:
# Add a boolean column that is True exactly where the state is 'Ohio'.
frame['eastern'] = frame.state == 'Ohio'

In [45]:
frame


Out[45]:
state year pop debt eastern
0 Ohio 2000 1.5 -1.2 True
1 Ohio 2001 1.8 -1.5 True
2 Ohio 2002 3.6 NaN True
3 Nevada 2001 2.4 NaN False
4 Nevada 2002 2.9 -1.7 False
5 Nevada 2003 3.2 NaN False

In [46]:
# Remove the column again; this modifies frame in place.
del frame['eastern']

In [47]:
frame


Out[47]:
state year pop debt
0 Ohio 2000 1.5 -1.2
1 Ohio 2001 1.8 -1.5
2 Ohio 2002 3.6 NaN
3 Nevada 2001 2.4 NaN
4 Nevada 2002 2.9 -1.7
5 Nevada 2003 3.2 NaN

In [48]:
frame.columns


Out[48]:
Index(['state', 'year', 'pop', 'debt'], dtype='object')

In [49]:
# Nested dict: the outer keys are states, the inner keys are years.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

In [50]:
# Nested dict -> DataFrame: outer keys become the columns, inner keys the row
# labels; a (year, state) pair with no entry shows up as NaN.
frame3 = pd.DataFrame(pop)

In [51]:
frame3


Out[51]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6

In [52]:
frame3.T


Out[52]:
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6

In [53]:
type(frame3['Ohio'][:-1])


Out[53]:
pandas.core.series.Series

In [55]:
frame3['Ohio'][:-1]


Out[55]:
2000    1.5
2001    1.7
Name: Ohio, dtype: float64

In [56]:
frame3.values


Out[56]:
array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

In [57]:
# Keep the frame's underlying 2-D NumPy array so its shape can be inspected below.
res = frame3.values

In [58]:
res.shape


Out[58]:
(3, 2)

In [59]:
type(res.shape)


Out[59]:
tuple

In [60]:
# 3x3 frame holding 0.0 .. 8.0 filled row by row; note there is no row labelled 'b'.
frame = pd.DataFrame(
    np.arange(9.0).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)

In [61]:
frame


Out[61]:
Ohio Texas California
a 0.0 1.0 2.0
c 3.0 4.0 5.0
d 6.0 7.0 8.0

In [63]:
# Reindex the rows; label 'b' does not exist in frame, so an all-NaN row is inserted for it.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [64]:
frame2


Out[64]:
Ohio Texas California
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0

In [65]:
# Reindex along the columns: 'Georgia' is new and comes back all-NaN, while
# 'California' is not requested and is dropped. This rebinds frame3.
frame3 = frame2.reindex(columns = ['Ohio', 'Texas', 'Georgia'])

In [66]:
frame3


Out[66]:
Ohio Texas Georgia
a 0.0 1.0 NaN
b NaN NaN NaN
c 3.0 4.0 NaN
d 6.0 7.0 NaN

In [67]:
# 4x4 integer frame (0 .. 15, row by row) with state names as row labels.
# This rebinds `data`, which earlier in the notebook held a plain dict.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [68]:
data


Out[68]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

In [69]:
data.loc['Colorado', ['two', 'three']]


Out[69]:
two      5
three    6
Name: Colorado, dtype: int64

In [70]:
type(data.loc['Colorado', ['two', 'three']])


Out[70]:
pandas.core.series.Series

In [71]:
data.iloc[2]


Out[71]:
one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [72]:
data.iloc[:, :3]


Out[72]:
one two three
Ohio 0 1 2
Colorado 4 5 6
Utah 8 9 10
New York 12 13 14

In [73]:
data.iloc[:, :3][data.three > 5]


Out[73]:
one two three
Colorado 4 5 6
Utah 8 9 10
New York 12 13 14

In [79]:
data.at['Colorado', 'three']


Out[79]:
6

In [80]:
# Random 4x3 demo data with columns b, d, e and state names as row labels.
# NOTE(review): no seed is set in the visible cells, so these values (and every
# output derived from `frame` below) change on each run — consider seeding.
frame = pd.DataFrame(np.random.randn(4,3), columns = list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [81]:
frame


Out[81]:
b d e
Utah 0.990554 0.811534 0.540745
Ohio 1.099677 0.143110 0.468141
Texas -1.178280 1.164022 -0.356268
Oregon 0.527642 -0.804912 -1.620197

In [82]:
np.abs(frame)


Out[82]:
b d e
Utah 0.990554 0.811534 0.540745
Ohio 1.099677 0.143110 0.468141
Texas 1.178280 1.164022 0.356268
Oregon 0.527642 0.804912 1.620197

In [83]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` is whatever ``DataFrame.apply`` passes in (one column or one row);
    it only needs ``max()`` and ``min()`` methods.
    """
    return x.max() - x.min()

In [84]:
frame.apply(f)


Out[84]:
b    2.277957
d    1.968935
e    2.160942
dtype: float64

In [86]:
frame.apply(f, axis='columns')


Out[86]:
Utah      0.449809
Ohio      0.956567
Texas     2.342302
Oregon    2.147838
dtype: float64

In [87]:
def f(x):
    """Summarise ``x`` as a two-entry Series labelled 'min' and 'max'.

    Note: this rebinds ``f``, replacing the max-minus-min helper defined earlier.
    """
    lowest = x.min()
    highest = x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])

In [88]:
frame.apply(f)


Out[88]:
b d e
min -1.178280 -0.804912 -1.620197
max 1.099677 1.164022 0.540745

In [89]:
def format(x):
    """Render ``x`` as a string with exactly two decimal places.

    NOTE: this name shadows the builtin ``format``. It is kept because the
    following cells pass it by name to ``applymap`` and ``map``.
    """
    return '%.2f' % x

In [90]:
# Apply the formatter to every element of the frame, producing strings.
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map — confirm the target pandas version before switching.
frame.applymap(format)


Out[90]:
b d e
Utah 0.99 0.81 0.54
Ohio 1.10 0.14 0.47
Texas -1.18 1.16 -0.36
Oregon 0.53 -0.80 -1.62

In [91]:
frame['e']


Out[91]:
Utah      0.540745
Ohio      0.468141
Texas    -0.356268
Oregon   -1.620197
Name: e, dtype: float64

In [92]:
frame['e'].map(format)


Out[92]:
Utah       0.54
Ohio       0.47
Texas     -0.36
Oregon    -1.62
Name: e, dtype: object

In [93]:
# Values 0..3 under labels that are not in alphabetical order (used for sorting below).
obj = pd.Series(range(4), index=list('dabc'))

In [94]:
obj


Out[94]:
d    0
a    1
b    2
c    3
dtype: int64

In [95]:
obj.sort_index()


Out[95]:
a    1
b    2
c    3
d    0
dtype: int64

In [96]:
frame


Out[96]:
b d e
Utah 0.990554 0.811534 0.540745
Ohio 1.099677 0.143110 0.468141
Texas -1.178280 1.164022 -0.356268
Oregon 0.527642 -0.804912 -1.620197

In [97]:
frame.sort_index()


Out[97]:
b d e
Ohio 1.099677 0.143110 0.468141
Oregon 0.527642 -0.804912 -1.620197
Texas -1.178280 1.164022 -0.356268
Utah 0.990554 0.811534 0.540745

In [98]:
frame.sort_index(axis=0)


Out[98]:
b d e
Ohio 1.099677 0.143110 0.468141
Oregon 0.527642 -0.804912 -1.620197
Texas -1.178280 1.164022 -0.356268
Utah 0.990554 0.811534 0.540745

In [101]:
frame.sort_index(axis='columns', ascending=False)


Out[101]:
e d b
Utah 0.540745 0.811534 0.990554
Ohio 0.468141 0.143110 1.099677
Texas -0.356268 1.164022 -1.178280
Oregon -1.620197 -0.804912 0.527642

In [102]:
frame


Out[102]:
b d e
Utah 0.990554 0.811534 0.540745
Ohio 1.099677 0.143110 0.468141
Texas -1.178280 1.164022 -0.356268
Oregon 0.527642 -0.804912 -1.620197

In [103]:
frame.sort_values(by='e')


Out[103]:
b d e
Oregon 0.527642 -0.804912 -1.620197
Texas -1.178280 1.164022 -0.356268
Ohio 1.099677 0.143110 0.468141
Utah 0.990554 0.811534 0.540745

In [104]:
frame.sort_values(by=['e', 'd'])


Out[104]:
b d e
Oregon 0.527642 -0.804912 -1.620197
Texas -1.178280 1.164022 -0.356268
Ohio 1.099677 0.143110 0.468141
Utah 0.990554 0.811534 0.540745

In [105]:
# Small frame with missing values: row 'c' is entirely NaN and 'a' lacks column 'two'.
df = pd.DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=list('abcd'),
    columns=['one', 'two'],
)

In [106]:
df


Out[106]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3

In [107]:
df.sum()


Out[107]:
one    9.25
two   -5.80
dtype: float64

In [108]:
df.sum(axis=1)


Out[108]:
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [109]:
df.T


Out[109]:
a b c d
one 1.4 7.1 NaN 0.75
two NaN -4.5 NaN -1.30

In [110]:
df.T.sum(axis=1)


Out[110]:
one    9.25
two   -5.80
dtype: float64

In [111]:
df.mean()


Out[111]:
one    3.083333
two   -2.900000
dtype: float64

In [112]:
df


Out[112]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3

In [113]:
# Hand-check of the mean of column 'one': the NaN row is skipped, so only the
# three present values are averaged.
hmm = (1.40+7.10+0.75)/3

In [114]:
hmm


Out[114]:
3.0833333333333335

In [115]:
df.mean(skipna=False)


Out[115]:
one   NaN
two   NaN
dtype: float64

In [116]:
df.drop('c')


Out[116]:
one two
a 1.40 NaN
b 7.10 -4.5
d 0.75 -1.3

In [119]:
df.drop('c').mean(skipna=False)


Out[119]:
one    3.083333
two         NaN
dtype: float64

In [120]:
df


Out[120]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3

In [121]:
df.fillna(0)


Out[121]:
one two
a 1.40 0.0
b 7.10 -4.5
c 0.00 0.0
d 0.75 -1.3

In [122]:
df.fillna(0).mean(skipna=False)


Out[122]:
one    2.3125
two   -1.4500
dtype: float64

In [123]:
df.cumsum()


Out[123]:
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8

In [124]:
# Store the running total of column 'one' as a new column 'three'; the NaN in
# row 'c' stays NaN while the total carries on afterwards.
df['three'] = df['one'].cumsum()

In [125]:
df


Out[125]:
one two three
a 1.40 NaN 1.40
b 7.10 -4.5 8.50
c NaN NaN NaN
d 0.75 -1.3 9.25

In [126]:
# Five answers for each of three questions; values range over 1..5.
data = pd.DataFrame({
    'Qu1': [1, 3, 4, 3, 4],
    'Qu2': [2, 3, 1, 2, 3],
    'Qu3': [1, 5, 2, 4, 4],
})

In [127]:
data


Out[127]:
Qu1 Qu2 Qu3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4

In [129]:
# Check what value_counts() hands back. The recorded output for this cell is a
# type (pandas.core.series.Series), so the expression is wrapped in type() to
# agree with it.
type(data['Qu1'].value_counts())


Out[129]:
pandas.core.series.Series

In [137]:
# Tabulate, per question column, how often each answer value occurs; values a
# column never contains come out as NaN. The Series method is used instead of
# the top-level pd.value_counts helper, which newer pandas releases deprecate.
result = data.apply(pd.Series.value_counts)

In [138]:
result


Out[138]:
Qu1 Qu2 Qu3
1 1.0 1.0 1.0
2 NaN 2.0 1.0
3 2.0 2.0 NaN
4 2.0 NaN 2.0
5 NaN NaN 1.0

In [ ]: