In [6]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [2]:
obj = Series([1,2,3])
obj.values


Out[2]:
array([1, 2, 3])

In [3]:
obj.index


Out[3]:
Int64Index([0, 1, 2], dtype=int64)

In [4]:
obj = Series([4,5,6], index=list("abc"))
obj


Out[4]:
a    4
b    5
c    6
dtype: int64

In [5]:
obj[['a','b']]


Out[5]:
a    4
b    5
dtype: int64

In [7]:
np.exp(obj)


Out[7]:
a     54.598150
b    148.413159
c    403.428793
dtype: float64

In [9]:
st=list("abe")
x2 = Series(obj, index=st)
x2


Out[9]:
a     4
b     5
e   NaN
dtype: float64

In [10]:
x2.isnull()


Out[10]:
a    False
b    False
e     True
dtype: bool

In [11]:
x2[x2.notnull()]


Out[11]:
a    4
b    5
dtype: float64

In [12]:
obj.name = 'test'
obj


Out[12]:
a    4
b    5
c    6
Name: test, dtype: int64

In [13]:
obj.index = ['g','h','i']

In [15]:
# Data Frames

In [16]:
# Column-oriented raw data: each key is a column name, each list holds that column's values
data = dict(
    st=['ct', 'ny', 'ca'],
    year=[2008, 2009, 2010],
    pop=[1.5, 1.2, 1.8],
)
data


Out[16]:
{'pop': [1.5, 1.2, 1.8], 'st': ['ct', 'ny', 'ca'], 'year': [2008, 2009, 2010]}

In [17]:
df = DataFrame(data)
df


Out[17]:
pop st year
0 1.5 ct 2008
1 1.2 ny 2009
2 1.8 ca 2010

In [18]:
# Re-arrange columns
DataFrame(data, columns=['year','st','pop'])


Out[18]:
year st pop
0 2008 ct 1.5
1 2009 ny 1.2
2 2010 ca 1.8

In [24]:
# Re-arrange columns + Re-index
df = DataFrame(data, columns=['year','st','pop'],
          index = ['a','b','c'])

In [25]:
df.year


Out[25]:
a    2008
b    2009
c    2010
Name: year, dtype: int64

In [26]:
df['year']


Out[26]:
a    2008
b    2009
c    2010
Name: year, dtype: int64

In [27]:
# Accessing a row by integer position with iloc
# (.ix is ambiguous between labels and positions and is deprecated; iloc is always positional)
df.iloc[0]


Out[27]:
year    2008
st        ct
pop      1.5
Name: a, dtype: object

In [28]:
# Accessing a row by index label with loc (explicit label-based replacement for .ix)
df.loc['a']


Out[28]:
year    2008
st        ct
pop      1.5
Name: a, dtype: object

In [29]:
df['extra'] = 100
df


Out[29]:
year st pop extra
a 2008 ct 1.5 100
b 2009 ny 1.2 100
c 2010 ca 1.8 100

In [31]:
df['extra'] = np.arange(3)
df


Out[31]:
year st pop extra
a 2008 ct 1.5 0
b 2009 ny 1.2 1
c 2010 ca 1.8 2

In [33]:
df['isct'] = df['st'] == "ct"
df


Out[33]:
year st pop extra isct
a 2008 ct 1.5 0 True
b 2009 ny 1.2 1 False
c 2010 ca 1.8 2 False

In [37]:
DataFrame(df, index=['a','b','e'])


Out[37]:
year st pop extra isct
a 2008 ct 1.5 0 True
b 2009 ny 1.2 1 False
e NaN NaN NaN NaN NaN

In [38]:
df.index.name = "alpha"
df.columns.name = "info"
df


Out[38]:
info year st pop extra isct
alpha
a 2008 ct 1.5 0 True
b 2009 ny 1.2 1 False
c 2010 ca 1.8 2 False

In [39]:
df.values


Out[39]:
array([[2008, 'ct', 1.5, 0, True],
       [2009, 'ny', 1.2, 1, False],
       [2010, 'ca', 1.8, 2, False]], dtype=object)

In [40]:
df.index


Out[40]:
Index([u'a', u'b', u'c'], dtype=object)

In [41]:
df.index[0]


Out[41]:
'a'

In [42]:
df.index[0] = "new" # Index objects are immutable


---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-42-b411703d64b6> in <module>()
----> 1 df.index[0] = "new" # Index objects are immutable

/usr/local/lib/python2.7/site-packages/pandas/core/index.pyc in __setitem__(self, key, value)
    328 
    329     def __setitem__(self, key, value):
--> 330         raise Exception(str(self.__class__) + ' object is immutable')
    331 
    332     def __getitem__(self, key):

Exception: <class 'pandas.core.index.Index'> object is immutable

In [43]:
df.columns


Out[43]:
Index([u'year', u'st', u'pop', u'extra', u'isct'], dtype=object)

In [44]:
'pop' in df.columns


Out[44]:
True

In [45]:
df['pop']


Out[45]:
alpha
a        1.5
b        1.2
c        1.8
Name: pop, dtype: float64

In [47]:
# NOTE: `'pop' in df.columns` evaluates to a single boolean (True here), not a position.
# Indexing df.columns with True behaves like position 1, so this selects the 'st' column,
# not 'pop' (see the output). Use df['pop'] to select the 'pop' column.
df[df.columns['pop' in df.columns]]


Out[47]:
alpha
a        ct
b        ny
c        ca
Name: st, dtype: object

In [48]:
df.index


Out[48]:
Index([u'a', u'b', u'c'], dtype=object)

In [49]:
# Re-arrange rows
df.reindex(['c','b','a'])


Out[49]:
info year st pop extra isct
c 2010 ca 1.8 2 False
b 2009 ny 1.2 1 False
a 2008 ct 1.5 0 True

In [50]:
df.reindex(['c','b','a','e'], fill_value=0)


Out[50]:
info year st pop extra isct
c 2010 ca 1.8 2 False
b 2009 ny 1.2 1 False
a 2008 ct 1.5 0 True
e 0 0 0.0 0 0

In [51]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method="ffill") # Forward Fill Missing


Out[51]:
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [61]:
df.drop('c')


Out[61]:
info year st pop extra isct
alpha
a 2008 ct 1.5 0 True
b 2009 ny 1.2 1 False

In [62]:
df.drop('isct', axis=1)


Out[62]:
info year st pop extra
alpha
a 2008 ct 1.5 0
b 2009 ny 1.2 1
c 2010 ca 1.8 2

In [64]:
# Slicing with labels
# The endpoint is included
df['a':'b']


Out[64]:
info year st pop extra isct
alpha
a 2008 ct 1.5 0 True
b 2009 ny 1.2 1 False

In [65]:
df[['year','st']]


Out[65]:
info year st
alpha
a 2008 ct
b 2009 ny
c 2010 ca

In [66]:
# Indexing by rows + columns with loc: a label slice for rows (endpoint included)
# and a list of column labels (.ix is deprecated; loc is the label-based accessor)
df.loc['a':'b', ['year','st','pop']]


Out[66]:
year st pop
alpha
a 2008 ct 1.5
b 2009 ny 1.2

In [68]:
# In arithmetic operations, index labels present in only one operand produce NaN in the result
a1 = pd.DataFrame([1,2,3], index=['a','b','c'])
a2 = pd.DataFrame([10,20,30], index=['a','b','d'])
a1 + a2


Out[68]:
0
a 11
b 22
c NaN
d NaN

In [69]:
# Applying a function on 1D Arrays to each column
def f(x):
    """Return the spread (maximum minus minimum) of a 1-D array-like such as a column."""
    return x.max() - x.min()


# 4x3 frame of standard-normal draws, rows labelled by state, columns 'b', 'd', 'e'
obj = DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)
obj


Out[69]:
b d e
Utah 1.506451 0.940977 -0.161616
Ohio 0.903284 0.537459 -0.526920
Texas 0.238575 -1.098292 0.244920
Oregon -1.982642 -0.445283 0.506898

In [70]:
# Similar to R apply
obj.apply(f)


Out[70]:
b    3.489094
d    2.039268
e    1.033818
dtype: float64

In [71]:
# Apply an element-wise formatter to every cell of the frame.
# The helper is deliberately NOT called `format`, which would shadow the
# built-in format() for every later cell in the session.
def two_decimals(x):
    """Return x rendered as a string with two digits after the decimal point."""
    return '%.2f' % x

obj.applymap(two_decimals)


Out[71]:
b d e
Utah 1.51 0.94 -0.16
Ohio 0.90 0.54 -0.53
Texas 0.24 -1.10 0.24
Oregon -1.98 -0.45 0.51

In [72]:
obj.sort_index()


Out[72]:
b d e
Ohio 0.903284 0.537459 -0.526920
Oregon -1.982642 -0.445283 0.506898
Texas 0.238575 -1.098292 0.244920
Utah 1.506451 0.940977 -0.161616

In [74]:
obj.sort_index(axis=1, ascending=False)


Out[74]:
e d b
Utah -0.161616 0.940977 1.506451
Ohio -0.526920 0.537459 0.903284
Texas 0.244920 -1.098292 0.238575
Oregon 0.506898 -0.445283 -1.982642

In [76]:
# To sort by values, use order
obj['e'].order()


Out[76]:
Ohio     -0.526920
Utah     -0.161616
Texas     0.244920
Oregon    0.506898
Name: e, dtype: float64

In [78]:
# To sort a DataFrame by the values in one or more columns, pass `by` to sort_index
obj.sort_index(by=['e','d'])


Out[78]:
b d e
Ohio 0.903284 0.537459 -0.526920
Utah 1.506451 0.940977 -0.161616
Texas 0.238575 -1.098292 0.244920
Oregon -1.982642 -0.445283 0.506898

In [81]:
obj['e'].rank(method="first") # Rank with tie break


Out[81]:
Utah      2
Ohio      1
Texas     3
Oregon    4
Name: e, dtype: float64

In [84]:
ser = Series([1,1,2,10,3,5], index=list('aabcde'))
ser.index.is_unique


Out[84]:
False

In [85]:
obj


Out[85]:
b d e
Utah 1.506451 0.940977 -0.161616
Ohio 0.903284 0.537459 -0.526920
Texas 0.238575 -1.098292 0.244920
Oregon -1.982642 -0.445283 0.506898

In [86]:
obj.sum()


Out[86]:
b    0.665667
d   -0.065139
e    0.063281
dtype: float64

In [88]:
obj.sum(axis=1, skipna=True)


Out[88]:
Utah      2.285812
Ohio      0.913822
Texas    -0.614797
Oregon   -1.921028
dtype: float64

In [89]:
obj.describe()


Out[89]:
b d e
count 4.000000 4.000000 4.000000
mean 0.166417 -0.016285 0.015820
std 1.523409 0.926959 0.454494
min -1.982642 -1.098292 -0.526920
25% -0.316730 -0.608535 -0.252942
50% 0.570929 0.046088 0.041652
75% 1.054076 0.638338 0.310414
max 1.506451 0.940977 0.506898

In [91]:
obj.unstack()


Out[91]:
b  Utah      1.506451
   Ohio      0.903284
   Texas     0.238575
   Oregon   -1.982642
d  Utah      0.940977
   Ohio      0.537459
   Texas    -1.098292
   Oregon   -0.445283
e  Utah     -0.161616
   Ohio     -0.526920
   Texas     0.244920
   Oregon    0.506898
dtype: float64

In [94]:
# Show the Series itself, a separator, then how often each distinct value occurs.
# Parenthesized single-argument print produces the same output under Python 2
# and is also valid Python 3.
print(ser)
print('----')
ser.value_counts()


a     1
a     1
b     2
c    10
d     3
e     5
dtype: int64
----
Out[94]:
1     2
5     1
3     1
10    1
2     1
dtype: int64

In [95]:
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data


Out[95]:
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [96]:
data.dropna()


Out[96]:
0    1.0
2    3.5
4    7.0
dtype: float64

In [98]:
# But dropna drops any row that has an NA
# With DataFrames, the requirement might be different

data = DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
data


Out[98]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3

In [100]:
# Drop a row only if all of its values are NA
data.dropna(axis=0, how='all')


Out[100]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
3 NaN 6.5 3

In [101]:
data.fillna(0)


Out[101]:
0 1 2
0 1 6.5 3
1 1 0.0 0
2 0 0.0 0
3 0 6.5 3

In [103]:
# However, fillna returns a new object by default
# Pass inplace=True to modify the existing object instead
# (the `_ =` only discards the call's return value)

_ = data.fillna(0, inplace=True)
data


Out[103]:
0 1 2
0 1 6.5 3
1 1 0.0 0
2 0 0.0 0
3 0 6.5 3

In [104]:
# Hierarchical Indexing
data = Series(np.random.randn(10), 
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data


Out[104]:
a  1    0.239378
   2   -0.895084
   3   -0.266205
b  1    1.244437
   2   -0.091399
   3    1.618462
c  1    0.770874
   2    2.563309
d  2   -0.420505
   3   -1.692568
dtype: float64

In [105]:
# Shows that data has a multi index
data.index


Out[105]:
MultiIndex
[(u'a', 1), (u'a', 2), (u'a', 3), (u'b', 1), (u'b', 2), (u'b', 3), (u'c', 1), (u'c', 2), (u'd', 2), (u'd', 3)]

In [107]:
# Shows all values under the outer index label 'a'
data['a',]


Out[107]:
1    0.239378
2   -0.895084
3   -0.266205
dtype: float64

In [109]:
# Shows the value at inner index label 1 for each outer label that has one (a, b and c)
data[:,1]


Out[109]:
a    0.239378
b    1.244437
c    0.770874
dtype: float64

In [110]:
data.unstack()


Out[110]:
1 2 3
a 0.239378 -0.895084 -0.266205
b 1.244437 -0.091399 1.618462
c 0.770874 2.563309 NaN
d NaN -0.420505 -1.692568

In [111]:
# Hierarchical Indexing in Data Frames
df = DataFrame(np.arange(12).reshape((4, 3)), index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
df


Out[111]:
Ohio Colorado
Green Red Green
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11

In [112]:
# The hierarchical levels can have names (as strings or 
# any Python objects). If so, these will show up in the console output (don’t confuse the index names with the axis labels!):

df.index.names = ['key1','key2']
df.columns.names = ['state', 'colour']
df


Out[112]:
state Ohio Colorado
colour Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11

In [113]:
# Sorting the rows by the values of index level 1 (key2)
df.sortlevel(1)


Out[113]:
state Ohio Colorado
colour Green Red Green
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11

In [114]:
df.swaplevel('key1','key2')


Out[114]:
state Ohio Colorado
colour Green Red Green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11

In [115]:
df.sum(level='key2')


Out[115]:
state Ohio Colorado
colour Green Red Green
key2
1 6 8 10
2 12 14 16

In [116]:
df.sum(level='colour', axis = 1)


Out[116]:
colour Green Red
key1 key2
a 1 2 1
2 8 4
b 1 14 7
2 20 10

In [118]:
obj


Out[118]:
b d e
Utah 1.506451 0.940977 -0.161616
Ohio 0.903284 0.537459 -0.526920
Texas 0.238575 -1.098292 0.244920
Oregon -1.982642 -0.445283 0.506898

In [119]:
# Setting an index from a column name
obj.set_index('b')


Out[119]:
d e
b
1.506451 0.940977 -0.161616
0.903284 0.537459 -0.526920
0.238575 -1.098292 0.244920
-1.982642 -0.445283 0.506898

In [120]:
obj.set_index('b', drop=False)


Out[120]:
b d e
b
1.506451 1.506451 0.940977 -0.161616
0.903284 0.903284 0.537459 -0.526920
0.238575 0.238575 -1.098292 0.244920
-1.982642 -1.982642 -0.445283 0.506898

In [121]:
obj


Out[121]:
b d e
Utah 1.506451 0.940977 -0.161616
Ohio 0.903284 0.537459 -0.526920
Texas 0.238575 -1.098292 0.244920
Oregon -1.982642 -0.445283 0.506898

In [122]:
_ = obj.set_index('b', inplace=True)

In [123]:
obj


Out[123]:
d e
b
1.506451 0.940977 -0.161616
0.903284 0.537459 -0.526920
0.238575 -1.098292 0.244920
-1.982642 -0.445283 0.506898

In [124]:


In [ ]: