In [2]:
import numpy as np
import pandas as pd

In [3]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data


Out[3]:
0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
# get the underlying NumPy array
data.values


Out[4]:
array([ 0.25,  0.5 ,  0.75,  1.  ])

In [5]:
data.index


Out[5]:
RangeIndex(start=0, stop=4, step=1)

In [6]:
data[1]


Out[6]:
0.5

In [7]:
data[1:3]


Out[7]:
1    0.50
2    0.75
dtype: float64

In [8]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data


Out[8]:
a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [9]:
data['b']


Out[9]:
0.5

In [10]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])
data


Out[10]:
2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [11]:
data[5]


Out[11]:
0.5

In [12]:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population


Out[12]:
California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [13]:
population['California']


Out[13]:
38332521

In [14]:
population['California':'Illinois']  # slicing with an explicit index includes the final value


Out[14]:
California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

In [15]:
pd.Series(5, index=[100, 200, 300])


Out[15]:
100    5
200    5
300    5
dtype: int64

In [16]:
pd.Series({2:'a', 1:'b', 3:'c'})


Out[16]:
1    b
2    a
3    c
dtype: object

In [17]:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])


Out[17]:
3    c
2    a
dtype: object

In [18]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area


Out[18]:
California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64

In [19]:
states = pd.DataFrame({'population': population,
                       'area': area})
states


Out[19]:
              area  population
California  423967    38332521
Florida     170312    19552860
Illinois    149995    12882135
New York    141297    19651127
Texas       695662    26448193

In [20]:
states.index


Out[20]:
Index([u'California', u'Florida', u'Illinois', u'New York', u'Texas'], dtype='object')

In [21]:
states.columns


Out[21]:
Index([u'area', u'population'], dtype='object')

In [22]:
states['area']


Out[22]:
California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [23]:
pd.DataFrame(population, columns=['population'])


Out[23]:
            population
California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193

In [24]:
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)


Out[24]:
   a  b
0  0  0
1  1  2
2  2  4

In [25]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])


Out[25]:
     a  b    c
0  1.0  2  NaN
1  NaN  3  4.0

In [26]:
pd.DataFrame({'population': population, 'area': area})


Out[26]:
              area  population
California  423967    38332521
Florida     170312    19552860
Illinois    149995    12882135
New York    141297    19651127
Texas       695662    26448193

In [27]:
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])


Out[27]:
        foo       bar
a  0.456953  0.866265
b  0.237729  0.044402
c  0.783007  0.848167

In [28]:
pd.DataFrame(np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')]))


Out[28]:
   A    B
0  0  0.0
1  0  0.0
2  0  0.0

In [29]:
ind = pd.Index([2, 3, 5, 7, 11])
print ind
print ind[1]
print ind[::2]
# ind[1] = 0 would raise a TypeError: Index objects are immutable
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
print indA & indB
print indA | indB
print indA ^ indB


Int64Index([2, 3, 5, 7, 11], dtype='int64')
3
Int64Index([2, 5, 11], dtype='int64')
Int64Index([3, 5, 7], dtype='int64')
Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
Int64Index([1, 2, 9, 11], dtype='int64')
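
Note: in newer pandas releases, &, |, and ^ on Index objects were redefined as
elementwise (bitwise) operators, and the set-operation behavior was deprecated;
the named methods are the stable spelling. A minimal sketch, assuming pandas >= 1.0:

indA.intersection(indB)          # set intersection, as indA & indB above
indA.union(indB)                 # set union, as indA | indB above
indA.symmetric_difference(indB)  # symmetric difference, as indA ^ indB above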

In [30]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print data['b']
print 'a' in data
print data.keys()
print list(data.iteritems())
data['e'] = 1.25
print data


0.5
True
Index([u'a', u'b', u'c', u'd'], dtype='object')
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]
a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64
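
Note: Series.iteritems() was deprecated and later removed (pandas 2.0);
items() is the portable spelling. A sketch under that assumption:

list(data.items())  # [('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0), ('e', 1.25)]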

In [31]:
print data['a':'c']
print data[0:2]
print data[(data > 0.3) & (data < 0.8)]
print data[['a', 'e']]


a    0.25
b    0.50
c    0.75
dtype: float64
a    0.25
b    0.50
dtype: float64
b    0.50
c    0.75
dtype: float64
a    0.25
e    1.25
dtype: float64

In [32]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])

# loc attribute allows indexing and slicing that always references the explicit index
print data.loc[1]
print data.loc[1:3]
print
# iloc attribute allows indexing and slicing that always references the implicit Python-style index
print data.iloc[1]
print data.iloc[1:3]


a
1    a
3    b
dtype: object

b
3    b
5    c
dtype: object

In [33]:
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data


Out[33]:
              area       pop
California  423967  38332521
Florida     170312  19552860
Illinois    149995  12882135
New York    141297  19651127
Texas       695662  26448193

In [34]:
print data['area']
print data.area
print data['area'] is data.area
print data['pop'] is data.pop  # DataFrame has a pop() method, so data.pop will point to this method


California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64
California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64
True
False

In [35]:
data['density'] = data['pop'] / data['area']
data


Out[35]:
              area       pop     density
California  423967  38332521   90.413926
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763
New York    141297  19651127  139.076746
Texas       695662  26448193   38.018740

In [36]:
data.values


Out[36]:
array([[  4.23967000e+05,   3.83325210e+07,   9.04139261e+01],
       [  1.70312000e+05,   1.95528600e+07,   1.14806121e+02],
       [  1.49995000e+05,   1.28821350e+07,   8.58837628e+01],
       [  1.41297000e+05,   1.96511270e+07,   1.39076746e+02],
       [  6.95662000e+05,   2.64481930e+07,   3.80187404e+01]])

In [37]:
data.T


Out[37]:
           California       Florida      Illinois      New York         Texas
area     4.239670e+05  1.703120e+05  1.499950e+05  1.412970e+05  6.956620e+05
pop      3.833252e+07  1.955286e+07  1.288214e+07  1.965113e+07  2.644819e+07
density  9.041393e+01  1.148061e+02  8.588376e+01  1.390767e+02  3.801874e+01

In [38]:
data.values[0]


Out[38]:
array([  4.23967000e+05,   3.83325210e+07,   9.04139261e+01])

In [39]:
data['area']


Out[39]:
California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [40]:
data.iloc[:3, :2]


Out[40]:
              area       pop
California  423967  38332521
Florida     170312  19552860
Illinois    149995  12882135

In [41]:
data.loc[:'Illinois', :'pop']


Out[41]:
              area       pop
California  423967  38332521
Florida     170312  19552860
Illinois    149995  12882135

In [42]:
# The ix indexer allows a hybrid of these two approaches (deprecated since pandas 0.20, removed in 1.0)
data.ix[:3, :'pop']


Out[42]:
              area       pop
California  423967  38332521
Florida     170312  19552860
Illinois    149995  12882135
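
Since ix was removed in pandas 1.0, the same hybrid lookup can be written by
chaining iloc and loc. A sketch, not the only possible spelling:

data.iloc[:3].loc[:, :'pop']  # positional row slice, then explicit column slice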

In [43]:
data.loc[data.density > 100, ['pop', 'density']]


Out[43]:
               pop     density
Florida   19552860  114.806121
New York  19651127  139.076746

In [44]:
print data['Florida':'Illinois']
print data[1:3]
print data[data.density > 100]


            area       pop     density
Florida   170312  19552860  114.806121
Illinois  149995  12882135   85.883763
            area       pop     density
Florida   170312  19552860  114.806121
Illinois  149995  12882135   85.883763
            area       pop     density
Florida   170312  19552860  114.806121
New York  141297  19651127  139.076746

In [45]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser


Out[45]:
0    6
1    3
2    7
3    4
dtype: int64

In [46]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=['A', 'B', 'C', 'D'])
df


Out[46]:
   A  B  C  D
0  6  9  2  6
1  7  4  3  7
2  7  2  5  4

In [47]:
np.exp(ser)


Out[47]:
0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [48]:
np.sin(df * np.pi / 4)


Out[48]:
          A             B         C             D
0 -1.000000  7.071068e-01  1.000000 -1.000000e+00
1 -0.707107  1.224647e-16  0.707107 -7.071068e-01
2 -0.707107  1.000000e+00 -0.707107  1.224647e-16

In [49]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')
population / area


Out[49]:
Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [50]:
area.index | population.index


Out[50]:
Index([u'Alaska', u'California', u'New York', u'Texas'], dtype='object')

In [51]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A+B


Out[51]:
0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [52]:
A.add(B, fill_value=0)


Out[52]:
0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [53]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB'))
A


Out[53]:
   A   B
0  1  11
1  5   1

In [54]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
B


Out[54]:
   B  A  C
0  4  0  9
1  5  8  0
2  9  2  6

In [55]:
A + B


Out[55]:
      A     B   C
0   1.0  15.0 NaN
1  13.0   6.0 NaN
2   NaN   NaN NaN

In [56]:
A.stack()


Out[56]:
0  A     1
   B    11
1  A     5
   B     1
dtype: int64

In [57]:
A.stack().mean()


Out[57]:
4.5

In [58]:
A.add(B, fill_value=A.stack().mean())


Out[58]:
      A     B     C
0   1.0  15.0  13.5
1  13.0   6.0   4.5
2   6.5  13.5  10.5

In [59]:
A = rng.randint(10, size=(3, 4))
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]


Out[59]:
   Q  R  S  T
0  0  0  0  0
1 -1 -2  2  4
2  3 -7  1  4

In [60]:
df.subtract(df['R'], axis=0)


Out[60]:
   Q  R  S  T
0 -5  0 -6 -4
1 -4  0 -2  2
2  5  0  2  7

In [61]:
halfrow = df.iloc[0, ::2]
halfrow


Out[61]:
Q    3
S    2
Name: 0, dtype: int64

In [62]:
df - halfrow


Out[62]:
     Q   R    S   T
0  0.0 NaN  0.0 NaN
1 -1.0 NaN  2.0 NaN
2  3.0 NaN  1.0 NaN

In [63]:
vals1 = np.array([1, None, 3, 4])
vals1


Out[63]:
array([1, None, 3, 4], dtype=object)

In [64]:
# This dtype=object means that the best common type representation NumPy could infer 
# for the contents of the array is that they are Python objects. While this kind of 
# object array is useful for some purposes, any operations on the data will be done 
# at the Python level, with much more overhead than the typically fast operations seen 
# for arrays with native types
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()


('dtype =', 'object')
10 loops, best of 3: 32.8 ms per loop
()
('dtype =', 'int')
1000 loops, best of 3: 808 µs per loop
()

In [65]:
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype


Out[65]:
dtype('float64')

In [66]:
print 1 + np.nan
print 0 *  np.nan
print vals2.sum(), vals2.min(), vals2.max()


nan
nan
nan nan nan

In [67]:
print np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)


8.0 1.0 4.0
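
Pandas builds this NaN-awareness in: unlike the raw NumPy methods, Series
reductions skip NaN by default (skipna=True). A quick sketch:

pd.Series(vals2).sum()  # 8.0 -- the NaN entry is ignored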

In [68]:
print pd.Series([1, np.nan, 2, None])
x = pd.Series(range(2), dtype=int)
print x
x[0] = None
print x


0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64
0    0
1    1
dtype: int64
0    NaN
1    1.0
dtype: float64

In [69]:
data = pd.Series([1, np.nan, 'hello', None])
print data.isnull()
print data.notnull()


0    False
1     True
2    False
3     True
dtype: bool
0     True
1    False
2     True
3    False
dtype: bool

In [70]:
data.dropna()


Out[70]:
0        1
2    hello
dtype: object

In [71]:
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])
df


Out[71]:
     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6

In [72]:
df.dropna()


Out[72]:
     0    1  2
1  2.0  3.0  5

In [73]:
df.dropna(axis='columns')


Out[73]:
   2
0  2
1  5
2  6

In [74]:
df[3] = np.nan
df


Out[74]:
     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN

In [75]:
df.dropna(axis='columns', how='all')


Out[75]:
     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6

In [76]:
df.dropna(axis='rows', thresh=3)  # rows with at least 3 non-nan vals


Out[76]:
     0    1  2   3
1  2.0  3.0  5 NaN

In [77]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data


Out[77]:
a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [78]:
data.fillna(0)


Out[78]:
a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [79]:
# forward-fill
data.fillna(method='ffill')


Out[79]:
a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [80]:
# backward-fill
data.fillna(method='bfill')


Out[80]:
a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64
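
In pandas 2.x the method argument to fillna() is deprecated in favor of
dedicated methods; a sketch of the equivalent fills under that assumption:

data.ffill()  # same result as data.fillna(method='ffill')
data.bfill()  # same result as data.fillna(method='bfill')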

In [81]:
df


Out[81]:
     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN

In [82]:
df.fillna(method='ffill', axis=1)


Out[82]:
     0    1    2    3
0  1.0  1.0  2.0  2.0
1  2.0  3.0  5.0  5.0
2  NaN  4.0  6.0  6.0

In [83]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
pop = pd.Series(populations, index=index)
pop


Out[83]:
(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [84]:
pop[('California', 2010):('Texas', 2000)]


Out[84]:
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [85]:
pop[[i for i in pop.index if i[1] == 2010]]


Out[85]:
(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [86]:
index = pd.MultiIndex.from_tuples(index)
index


Out[86]:
MultiIndex(levels=[[u'California', u'New York', u'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [87]:
pop = pop.reindex(index)
pop


Out[87]:
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [88]:
pop[:, 2010]


Out[88]:
California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [89]:
pop_df = pop.unstack()
pop_df


Out[89]:
                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561

In [90]:
pop_df.stack()


Out[90]:
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [91]:
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df


Out[91]:
                    total  under18
California 2000  33871648  9267089
           2010  37253956  9284094
New York   2000  18976457  4687374
           2010  19378102  4318033
Texas      2000  20851820  5906301
           2010  25145561  6879014

In [92]:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()


Out[92]:
                2000      2010
California  0.273594  0.249211
New York    0.247010  0.222831
Texas       0.283251  0.273568

In [93]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df


Out[93]:
        data1     data2
a 1  0.734581  0.739059
  2  0.722357  0.891815
b 1  0.687610  0.685793
  2  0.464258  0.567720

In [94]:
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
pd.Series(data)


Out[94]:
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [95]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])


Out[95]:
MultiIndex(levels=[[u'a', u'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [96]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])


Out[96]:
MultiIndex(levels=[[u'a', u'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [97]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])


Out[97]:
MultiIndex(levels=[[u'a', u'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [98]:
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              labels=[[0, 0, 1, 1], [0, 1, 0, 1]])


Out[98]:
MultiIndex(levels=[[u'a', u'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
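
Note: pandas 0.24 renamed the labels argument (and attribute) of MultiIndex to
codes, so on newer versions the constructor above would be spelled like this:

pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              codes=[[0, 0, 1, 1], [0, 1, 0, 1]])  # 'labels' renamed to 'codes'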

In [99]:
pop.index.names = ['state', 'year']
pop


Out[99]:
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [100]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])
# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data


Out[100]:
subject      Bob       Guido         Sue
type          HR  Temp    HR  Temp    HR  Temp
year visit
2013 1      27.0  36.7  24.0  34.6  37.0  37.4
     2      61.0  38.5  42.0  37.1  27.0  36.2
2014 1      32.0  37.1  38.0  37.6  35.0  38.1
     2      37.0  36.3  19.0  37.1  38.0  36.6

In [101]:
health_data['Guido']


Out[101]:
type          HR  Temp
year visit
2013 1      24.0  34.6
     2      42.0  37.1
2014 1      38.0  37.6
     2      19.0  37.1

In [102]:
pop


Out[102]:
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [103]:
pop['California', 2000]


Out[103]:
33871648

In [104]:
pop['California']


Out[104]:
year
2000    33871648
2010    37253956
dtype: int64

In [105]:
pop.loc['California':'New York']


Out[105]:
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [106]:
pop[:, 2000]


Out[106]:
state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [107]:
pop[pop > 22000000]


Out[107]:
state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [108]:
pop[['California', 'Texas']]


Out[108]:
state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [109]:
health_data


Out[109]:
subject      Bob       Guido         Sue
type          HR  Temp    HR  Temp    HR  Temp
year visit
2013 1      27.0  36.7  24.0  34.6  37.0  37.4
     2      61.0  38.5  42.0  37.1  27.0  36.2
2014 1      32.0  37.1  38.0  37.6  35.0  38.1
     2      37.0  36.3  19.0  37.1  38.0  36.6

In [113]:
health_data['Guido', 'HR']


Out[113]:
year  visit
2013  1        24.0
      2        42.0
2014  1        38.0
      2        19.0
Name: (Guido, HR), dtype: float64

In [111]:
health_data.iloc[:2, :2]


Out[111]:
subject      Bob
type          HR  Temp
year visit
2013 1      27.0  36.7
     2      61.0  38.5

In [114]:
health_data.loc[:, ('Bob', 'HR')]


Out[114]:
year  visit
2013  1        27.0
      2        61.0
2014  1        32.0
      2        37.0
Name: (Bob, HR), dtype: float64

In [115]:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]


Out[115]:
subject      Bob Guido   Sue
type          HR    HR    HR
year visit
2013 1      27.0  24.0  37.0
2014 1      32.0  38.0  35.0

In [116]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data


Out[116]:
char  int
a     1      0.217673
      2      0.911464
c     1      0.453282
      2      0.098445
b     1      0.290294
      2      0.817984
dtype: float64

In [117]:
try: 
    data['a':'b']
except KeyError as e: 
    print(type(e))
    print(e)


<type 'exceptions.KeyError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [118]:
data = data.sort_index()
data


Out[118]:
char  int
a     1      0.217673
      2      0.911464
b     1      0.290294
      2      0.817984
c     1      0.453282
      2      0.098445
dtype: float64

In [119]:
data['a':'b']


Out[119]:
char  int
a     1      0.217673
      2      0.911464
b     1      0.290294
      2      0.817984
dtype: float64

In [120]:
pop.unstack(level=0)


Out[120]:
state  California  New York     Texas
year
2000     33871648  18976457  20851820
2010     37253956  19378102  25145561

In [121]:
pop.unstack(level=1)


Out[121]:
year            2000      2010
state
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561

In [122]:
pop.unstack().stack()


Out[122]:
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [123]:
# optionally specify the name of the data for the column representation
pop_flat = pop.reset_index(name='population')
pop_flat


Out[123]:
        state  year  population
0  California  2000    33871648
1  California  2010    37253956
2    New York  2000    18976457
3    New York  2010    19378102
4       Texas  2000    20851820
5       Texas  2010    25145561

In [124]:
pop_flat.set_index(['state', 'year'])


Out[124]:
                 population
state      year
California 2000    33871648
           2010    37253956
New York   2000    18976457
           2010    19378102
Texas      2000    20851820
           2010    25145561

In [125]:
health_data


Out[125]:
subject      Bob       Guido         Sue
type          HR  Temp    HR  Temp    HR  Temp
year visit
2013 1      27.0  36.7  24.0  34.6  37.0  37.4
     2      61.0  38.5  42.0  37.1  27.0  36.2
2014 1      32.0  37.1  38.0  37.6  35.0  38.1
     2      37.0  36.3  19.0  37.1  38.0  36.6

In [126]:
data_mean = health_data.mean(level='year')
data_mean


Out[126]:
subject   Bob        Guido          Sue
type       HR  Temp    HR   Temp     HR   Temp
year
2013     44.0  37.6  33.0  35.85   32.0  36.80
2014     34.5  36.7  28.5  37.35   36.5  37.35

In [132]:
data_mean.mean(axis=1, level='type')


Out[132]:
type         HR       Temp
year
2013  36.333333  36.750000
2014  33.166667  37.133333
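
Newer pandas removed the level argument to aggregations like mean(); the
equivalent spelling goes through groupby. A sketch, assuming pandas >= 2.0:

health_data.groupby(level='year').mean()    # replaces mean(level='year')
data_mean.T.groupby(level='type').mean().T  # replaces mean(axis=1, level='type')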

In [1]:
import numpy as np
import pandas as pd
def make_df(cols, ind): 
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, ind) 
# example DataFrame
make_df('ABC', range(3))


Out[1]:
    A   B   C
0  A0  B0  C0
1  A1  B1  C1
2  A2  B2  C2

In [2]:
x=[1,2,3]
y=[4,5,6]
z=[7,8,9]
np.concatenate([x, y, z])


Out[2]:
array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [3]:
x = [[1, 2],
     [3, 4]]
np.concatenate([x, x], axis=1)


Out[3]:
array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [4]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])


Out[4]:
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [5]:
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
print(df1)
print(df2)
print(pd.concat([df1, df2]))


    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4

In [6]:
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
print(df3)
print(df4)
print(pd.concat([df3, df4], axis=1))  # axis='col' is not a valid axis name; use axis=1 (or 'columns')


    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1

In [7]:
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index # make duplicate indices!
print(x)
print(y)
print(pd.concat([x, y]))


    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3

In [8]:
try:
    # verify_integrity=True raises an exception when the indices overlap
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)


('ValueError:', ValueError('Indexes have overlapping values: [0, 1]',))

In [9]:
print(x)
print(y)
print(pd.concat([x, y], ignore_index=True))


    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
    A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3

In [10]:
print(x)
print(y)
print(pd.concat([x, y], keys=['x', 'y']))


    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
      A   B
x 0  A0  B0
  1  A1  B1
y 0  A2  B2
  1  A3  B3

In [11]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5)
print(df6)
print(pd.concat([df5, df6]))


    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4
     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4

In [12]:
print(df5)
print(df6)
print(pd.concat([df5, df6], join_axes=[df5.columns]))


    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4
     A   B   C
1   A1  B1  C1
2   A2  B2  C2
3  NaN  B3  C3
4  NaN  B4  C4
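
join_axes was removed in pandas 1.0; the documented replacement is to reindex
the concatenated result. A sketch of the same column selection:

pd.concat([df5, df6]).reindex(columns=df5.columns)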

In [13]:
print(df1)
print(df2)
print(df1.append(df2))  # append() returns a new object; df1 is unchanged


    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4
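
DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0; pd.concat
covers the same use. A sketch:

pd.concat([df1, df2])  # same result as df1.append(df2)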

In [14]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_date': [2004, 2008, 2012, 2014]})
print(df1)
print(df2)


  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014

In [15]:
df3 = pd.merge(df1, df2)
df3


Out[15]:
  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014

In [16]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                    'supervisor': ['Carly', 'Guido', 'Steve']})
print(df3)
print(df4)
print(pd.merge(df3, df4))


  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014
         group supervisor
0   Accounting      Carly
1  Engineering      Guido
2           HR      Steve
  employee        group  hire_date supervisor
0      Bob   Accounting       2008      Carly
1     Jake  Engineering       2012      Guido
2     Lisa  Engineering       2004      Guido
3      Sue           HR       2014      Steve

In [17]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
                              'Engineering', 'Engineering', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 'coding', 'linux',
                               'spreadsheets', 'organization']})
print(df1)
print(df5)
print(pd.merge(df1, df5))


  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
         group        skills
0   Accounting          math
1   Accounting  spreadsheets
2  Engineering        coding
3  Engineering         linux
4           HR  spreadsheets
5           HR  organization
  employee        group        skills
0      Bob   Accounting          math
1      Bob   Accounting  spreadsheets
2     Jake  Engineering        coding
3     Jake  Engineering         linux
4     Lisa  Engineering        coding
5     Lisa  Engineering         linux
6      Sue           HR  spreadsheets
7      Sue           HR  organization

In [18]:
print(df1)
print(df2)
print(pd.merge(df1, df2, on='employee'))


  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014
  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014

In [19]:
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'salary': [70000, 80000, 120000, 90000]})
print(df1)
print(df3)
print(pd.merge(df1, df3, left_on="employee", right_on="name"))


  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000
  employee        group  name  salary
0      Bob   Accounting   Bob   70000
1     Jake  Engineering  Jake   80000
2     Lisa  Engineering  Lisa  120000
3      Sue           HR   Sue   90000

In [20]:
pd.merge(df1, df3, left_on="employee", right_on="name").drop('name', axis=1)


Out[20]:
  employee        group  salary
0      Bob   Accounting   70000
1     Jake  Engineering   80000
2     Lisa  Engineering  120000
3      Sue           HR   90000

In [21]:
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
print(df1a)
print(df2a)


                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
          hire_date
employee           
Lisa           2004
Bob            2008
Jake           2012
Sue            2014

In [22]:
print(df1a)
print(df2a)
print(pd.merge(df1a, df2a, left_index=True, right_index=True))


                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
          hire_date
employee           
Lisa           2004
Bob            2008
Jake           2012
Sue            2014
                group  hire_date
employee                        
Lisa      Engineering       2004
Bob        Accounting       2008
Jake      Engineering       2012
Sue                HR       2014

In [23]:
print(df1a)
print(df2a)
# For convenience, DataFrames implement the join() method
# which performs a merge that defaults to joining on indices:
print(df1a.join(df2a))


                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
          hire_date
employee           
Lisa           2004
Bob            2008
Jake           2012
Sue            2014
                group  hire_date
employee                        
Bob        Accounting       2008
Jake      Engineering       2012
Lisa      Engineering       2004
Sue                HR       2014

In [24]:
print(df1a)
print(df3)
print(pd.merge(df1a, df3, left_index=True, right_on='name'))


                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000
         group  name  salary
0   Accounting   Bob   70000
1  Engineering  Jake   80000
2  Engineering  Lisa  120000
3           HR   Sue   90000

In [25]:
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
                    'food': ['fish', 'beans', 'bread']},
                    columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
                    'drink': ['wine', 'beer']},
                    columns=['name', 'drink'])
print(df6)
print(df7)
print(pd.merge(df6, df7))


    name   food
0  Peter   fish
1   Paul  beans
2   Mary  bread
     name drink
0    Mary  wine
1  Joseph  beer
   name   food drink
0  Mary  bread  wine

In [26]:
pd.merge(df6, df7, how='inner')


Out[26]:
   name   food drink
0  Mary  bread  wine

In [27]:
pd.merge(df6, df7, how='outer')


Out[27]:
     name   food drink
0   Peter   fish   NaN
1    Paul  beans   NaN
2    Mary  bread  wine
3  Joseph    NaN  beer

In [28]:
pd.merge(df6, df7, how='left')  # Using how='right' works in a similar manner.


Out[28]:
    name   food drink
0  Peter   fish   NaN
1   Paul  beans   NaN
2   Mary  bread  wine

In [29]:
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [3, 1, 4, 2]})
print(df8)
print(df9)
print(pd.merge(df8, df9, on="name"))


   name  rank
0   Bob     1
1  Jake     2
2  Lisa     3
3   Sue     4
   name  rank
0   Bob     3
1  Jake     1
2  Lisa     4
3   Sue     2
   name  rank_x  rank_y
0   Bob       1       3
1  Jake       2       1
2  Lisa       3       4
3   Sue       4       2

In [30]:
print(pd.merge(df8, df9, on="name", suffixes=["_L", "_R"]))


   name  rank_L  rank_R
0   Bob       1       3
1  Jake       2       1
2  Lisa       3       4
3   Sue       4       2

In [31]:
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 57935  100 57935    0     0   7628      0  0:00:07  0:00:07 --:--:--  9950
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   835  100   835    0     0    981      0 --:--:-- --:--:-- --:--:--   983
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   872  100   872    0     0    369      0  0:00:02  0:00:02 --:--:--   369

In [32]:
pop = pd.read_csv('state-population.csv')
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')
print(pop.head())
print(areas.head())
print(abbrevs.head())


  state/region     ages  year  population
0           AL  under18  2012   1117489.0
1           AL    total  2012   4817528.0
2           AL  under18  2010   1130966.0
3           AL    total  2010   4785570.0
4           AL  under18  2011   1125763.0
        state  area (sq. mi)
0     Alabama          52423
1      Alaska         656425
2     Arizona         114006
3    Arkansas          53182
4  California         163707
        state abbreviation
0     Alabama           AL
1      Alaska           AK
2     Arizona           AZ
3    Arkansas           AR
4  California           CA

In [34]:
merged = pd.merge(pop, abbrevs, how='outer',
                  left_on='state/region', right_on='abbreviation')
merged = merged.drop('abbreviation', axis=1)  # drop the duplicate info
merged.head()


Out[34]:
  state/region     ages  year  population    state
0           AL  under18  2012   1117489.0  Alabama
1           AL    total  2012   4817528.0  Alabama
2           AL  under18  2010   1130966.0  Alabama
3           AL    total  2010   4785570.0  Alabama
4           AL  under18  2011   1125763.0  Alabama

In [35]:
merged.isnull().any()


Out[35]:
state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

In [36]:
merged[merged['population'].isnull()].head()


Out[36]:
     state/region     ages  year  population state
2448           PR  under18  1990         NaN   NaN
2449           PR    total  1990         NaN   NaN
2450           PR    total  1991         NaN   NaN
2451           PR  under18  1991         NaN   NaN
2452           PR    total  1993         NaN   NaN

In [37]:
merged.loc[merged['state'].isnull(), 'state/region'].unique()


Out[37]:
array(['PR', 'USA'], dtype=object)

In [38]:
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States'
merged.isnull().any()


Out[38]:
state/region    False
ages            False
year            False
population       True
state           False
dtype: bool

In [41]:
final = pd.merge(merged, areas, on='state', how='left')
final.head()


Out[41]:
  state/region     ages  year  population    state  area (sq. mi)
0           AL  under18  2012   1117489.0  Alabama        52423.0
1           AL    total  2012   4817528.0  Alabama        52423.0
2           AL  under18  2010   1130966.0  Alabama        52423.0
3           AL    total  2010   4785570.0  Alabama        52423.0
4           AL  under18  2011   1125763.0  Alabama        52423.0

In [42]:
final.isnull().any()


Out[42]:
state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

In [43]:
final['state'][final['area (sq. mi)'].isnull()].unique()


Out[43]:
array(['United States'], dtype=object)

In [44]:
final.dropna(inplace=True)
final.head()


Out[44]:
  state/region     ages  year  population    state  area (sq. mi)
0           AL  under18  2012   1117489.0  Alabama        52423.0
1           AL    total  2012   4817528.0  Alabama        52423.0
2           AL  under18  2010   1130966.0  Alabama        52423.0
3           AL    total  2010   4785570.0  Alabama        52423.0
4           AL  under18  2011   1125763.0  Alabama        52423.0

In [45]:
final['area (sq. mi)'].isnull().any()


Out[45]:
False

In [46]:
data2010 = final.query("year == 2010 & ages == 'total'")
data2010.head()


Out[46]:
    state/region   ages  year  population       state  area (sq. mi)
3             AL  total  2010   4785570.0     Alabama        52423.0
91            AK  total  2010    713868.0      Alaska       656425.0
101           AZ  total  2010   6408790.0     Arizona       114006.0
189           AR  total  2010   2922280.0    Arkansas        53182.0
197           CA  total  2010  37333601.0  California       163707.0

In [47]:
data2010.set_index('state', inplace=True)
density = data2010['population'] / data2010['area (sq. mi)']
density.sort_values(ascending=False, inplace=True)
density.head()


Out[47]:
state
District of Columbia    8898.897059
Puerto Rico             1058.665149
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
dtype: float64

In [48]:
density.tail()


Out[48]:
state
South Dakota    10.583512
North Dakota     9.537565
Montana          6.736171
Wyoming          5.768079
Alaska           1.087509
dtype: float64

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape


Out[1]:
(1035, 6)

In [2]:
planets.head()


Out[2]:
            method  number  orbital_period   mass  distance  year
0  Radial Velocity       1         269.300   7.10     77.40  2006
1  Radial Velocity       1         874.774   2.21     56.95  2008
2  Radial Velocity       1         763.000   2.60     19.84  2011
3  Radial Velocity       1         326.030  19.40    110.62  2007
4  Radial Velocity       1         516.220  10.50    119.47  2009

In [3]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser


Out[3]:
0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [4]:
ser.mean(), ser.sum()


Out[4]:
(0.5623850983416314, 2.811925491708157)

In [5]:
df = pd.DataFrame({'A': rng.rand(5), 'B': rng.rand(5)})
df


Out[5]:
          A         B
0  0.155995  0.020584
1  0.058084  0.969910
2  0.866176  0.832443
3  0.601115  0.212339
4  0.708073  0.181825

In [6]:
df.mean()


Out[6]:
A    0.477888
B    0.443420
dtype: float64

In [7]:
df.mean(axis='columns')


Out[7]:
0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

In [8]:
planets.dropna().describe()


Out[8]:
          number  orbital_period        mass    distance         year
count  498.00000      498.000000  498.000000  498.000000   498.000000
mean     1.73494      835.778671    2.509320   52.068213  2007.377510
std      1.17572     1469.128259    3.636274   46.596041     4.167284
min      1.00000        1.328300    0.003600    1.350000  1989.000000
25%      1.00000       38.272250    0.212500   24.497500  2005.000000
50%      1.00000      357.000000    1.245000   39.940000  2009.000000
75%      2.00000      999.600000    2.867500   59.332500  2011.000000
max      6.00000    17337.500000   25.000000  354.000000  2014.000000

In [10]:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)}, columns=['key', 'data'])
df


Out[10]:
  key  data
0   A     0
1   B     1
2   C     2
3   A     3
4   B     4
5   C     5

In [11]:
df.groupby('key').sum()


Out[11]:
     data
key
A       3
B       5
C       7

In [12]:
planets.groupby('method')


Out[12]:
<pandas.core.groupby.DataFrameGroupBy object at 0x116cc4750>

In [13]:
planets.groupby('method')['orbital_period']


Out[13]:
<pandas.core.groupby.SeriesGroupBy object at 0x116cc4190>

In [14]:
planets.groupby('method')['orbital_period'].median()


Out[14]:
method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

In [15]:
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method, group.shape))


Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)

In [17]:
planets.groupby('method')['year'].describe().unstack()


Out[17]:
                               count         mean       std     min      25%     50%      75%     max
method
Astrometry                       2.0  2011.500000  2.121320  2010.0  2010.75  2011.5  2012.25  2013.0
Eclipse Timing Variations        9.0  2010.000000  1.414214  2008.0  2009.00  2010.0  2011.00  2012.0
Imaging                         38.0  2009.131579  2.781901  2004.0  2008.00  2009.0  2011.00  2013.0
Microlensing                    23.0  2009.782609  2.859697  2004.0  2008.00  2010.0  2012.00  2013.0
Orbital Brightness Modulation    3.0  2011.666667  1.154701  2011.0  2011.00  2011.0  2012.00  2013.0
Pulsar Timing                    5.0  1998.400000  8.384510  1992.0  1992.00  1994.0  2003.00  2011.0
Pulsation Timing Variations      1.0  2007.000000       NaN  2007.0  2007.00  2007.0  2007.00  2007.0
Radial Velocity                553.0  2007.518987  4.249052  1989.0  2005.00  2009.0  2011.00  2014.0
Transit                        397.0  2011.236776  2.077867  2002.0  2010.00  2012.0  2013.00  2014.0
Transit Timing Variations        4.0  2012.500000  1.290994  2011.0  2011.75  2012.5  2013.25  2014.0

In [18]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])
df


Out[18]:
  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9

In [19]:
df.groupby('key').aggregate(['min', np.median, max])


Out[19]:
    data1            data2
      min median max   min median max
key
A       0    1.5   3     3    4.0   5
B       1    2.5   4     0    3.5   7
C       2    3.5   5     3    6.0   9

In [20]:
df.groupby('key').aggregate({'data1': 'min', 'data2': 'max'})


Out[20]:
     data1  data2
key
A        0      5
B        1      7
C        2      9

In [21]:
print(df.groupby('key').std())
print(df.groupby('key').filter(lambda x: x['data2'].std() > 4))


       data1     data2
key                   
A    2.12132  1.414214
B    2.12132  4.949747
C    2.12132  4.242641
  key  data1  data2
1   B      1      0
2   C      2      3
4   B      4      7
5   C      5      9

In [22]:
df.groupby('key').transform(lambda x: x - x.mean())


Out[22]:
   data1  data2
0   -1.5    1.0
1   -1.5   -3.5
2   -1.5   -3.0
3    1.5   -1.0
4    1.5    3.5
5    1.5    3.0

In [26]:
def norm_by_data2(x):
    # x is a DataFrame of group values
    x['data1'] /= x['data2'].sum()
    return x
print(df.groupby('key').apply(norm_by_data2))


  key     data1  data2
0   A  0.000000      5
1   B  0.142857      0
2   C  0.166667      3
3   A  0.375000      3
4   B  0.571429      7
5   C  0.416667      9
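
A caveat: norm_by_data2 modifies the group it is handed in place, which newer
pandas flags with chained-assignment warnings. A side-effect-free variant,
sketched here under a hypothetical name:

def norm_by_data2_copy(x):
    x = x.copy()                    # work on a copy of the group
    x['data1'] /= x['data2'].sum()
    return x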

In [27]:
L=[0,1,0,1,2,0]
print(df)
print(df.groupby(L).sum())


  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
   data1  data2
0      7     17
1      4      3
2      4      7

In [28]:
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
print(df2)
print(df2.groupby(mapping).sum())


     data1  data2
key              
A        0      5
B        1      0
C        2      3
A        3      3
B        4      7
C        5      9
           data1  data2
consonant     12     19
vowel          3      8

In [29]:
df2.groupby(str.lower).mean()


Out[29]:
   data1  data2
a    1.5    4.0
b    2.5    3.5
c    3.5    6.0

In [30]:
df2.groupby([str.lower, mapping]).mean()


Out[30]:
             data1  data2
a vowel        1.5    4.0
b consonant    2.5    3.5
c consonant    3.5    6.0

In [31]:
decade = 10 * (planets['year'] // 10)
decade = decade.astype(str) + 's'
decade.name = 'decade'
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)


Out[31]:
decade                         1980s  1990s  2000s  2010s
method
Astrometry                       0.0    0.0    0.0    2.0
Eclipse Timing Variations        0.0    0.0    5.0   10.0
Imaging                          0.0    0.0   29.0   21.0
Microlensing                     0.0    0.0   12.0   15.0
Orbital Brightness Modulation    0.0    0.0    0.0    5.0
Pulsar Timing                    0.0    9.0    1.0    1.0
Pulsation Timing Variations      0.0    0.0    1.0    0.0
Radial Velocity                  1.0   52.0  475.0  424.0
Transit                          0.0    0.0   64.0  712.0
Transit Timing Variations        0.0    0.0    0.0    9.0

In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()


Out[42]:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class    who  adult_male deck  embark_town alive  alone
0         0       3    male  22.0      1      0   7.2500        S  Third    man        True  NaN  Southampton    no  False
1         1       1  female  38.0      1      0  71.2833        C  First  woman       False    C    Cherbourg   yes  False
2         1       3  female  26.0      0      0   7.9250        S  Third  woman       False  NaN  Southampton   yes   True
3         1       1  female  35.0      1      0  53.1000        S  First  woman       False    C  Southampton   yes  False
4         0       3    male  35.0      0      0   8.0500        S  Third    man        True  NaN  Southampton    no   True

In [43]:
titanic.groupby('sex')['survived'].mean()


Out[43]:
sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [45]:
titanic.groupby(['sex', 'class'])['survived'].mean().unstack()


Out[45]:
class      First    Second     Third
sex
female  0.968085  0.921053  0.500000
male    0.368852  0.157407  0.135447

In [46]:
titanic.pivot_table('survived', index='sex', columns='class')


Out[46]:
class      First    Second     Third
sex
female  0.968085  0.921053  0.500000
male    0.368852  0.157407  0.135447

In [47]:
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')


Out[47]:
class               First    Second     Third
sex    age
female (0, 18]   0.909091  1.000000  0.511628
       (18, 80]  0.972973  0.900000  0.423729
male   (0, 18]   0.800000  0.600000  0.215686
       (18, 80]  0.375000  0.071429  0.133663

In [50]:
fare = pd.qcut(titanic['fare'], 2)  # split fares into two quantile bins (roughly equal-sized groups)
titanic.pivot_table('survived', ['sex', age], [fare, 'class'])


Out[50]:
fare            [0, 14.454]                     (14.454, 512.329]
class                 First    Second     Third             First    Second     Third
sex    age
female (0, 18]          NaN  1.000000  0.714286          0.909091  1.000000  0.318182
       (18, 80]         NaN  0.880000  0.444444          0.972973  0.914286  0.391304
male   (0, 18]          NaN  0.000000  0.260870          0.800000  0.818182  0.178571
       (18, 80]         0.0  0.098039  0.125000          0.391304  0.030303  0.192308

In [51]:
titanic.pivot_table(index='sex', columns='class',
                    aggfunc={'survived':np.sum, 'fare':'mean'})


Out[51]:
              fare                       survived
class        First     Second      Third    First Second Third
sex
female  106.125798  21.970121  16.118810     91.0   70.0  72.0
male     67.226127  19.741782  12.661633     45.0   17.0  47.0

In [52]:
titanic.pivot_table('survived', index='sex', columns='class', margins=True)


Out[52]:
class      First    Second     Third       All
sex
female  0.968085  0.921053  0.500000  0.742038
male    0.368852  0.157407  0.135447  0.188908
All     0.629630  0.472826  0.242363  0.383838

In [53]:
# !curl -O https://raw.githubusercontent.com/jakevdp/data-CDCbirths/master/births.csv


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  258k  100  258k    0     0   9195      0  0:00:28  0:00:28 --:--:--  6838

In [54]:
births = pd.read_csv('births.csv')
births.head()


Out[54]:
   year  month  day gender  births
0  1969      1    1      F    4046
1  1969      1    1      M    4440
2  1969      1    2      F    4454
3  1969      1    2      M    4548
4  1969      1    3      F    4548

In [55]:
births['decade'] = 10 * (births['year'] // 10)
births.pivot_table('births', index='decade', columns='gender', aggfunc='sum')


Out[55]:
gender         F         M
decade
1960     1753634   1846572
1970    16263075  17121550
1980    18310351  19243452
1990    19479454  20420553
2000    18229309  19106428

In [56]:
%matplotlib inline
import matplotlib.pyplot as plt
sns.set() # use Seaborn styles
births.pivot_table('births', index='year', columns='gender', aggfunc='sum').plot()
plt.ylabel('total births per year')


Out[56]:
<matplotlib.text.Text at 0x119b83090>

In [57]:
quartiles = np.percentile(births['births'], [25, 50, 75])
mu = quartiles[1]
sig = 0.74 * (quartiles[2] - quartiles[0])  # robust std estimate: 0.74 * IQR for a Gaussian
births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)')  # sigma-clip outliers
births['day'] = births['day'].astype(int)
births.index = pd.to_datetime(10000 * births.year +
                              100 * births.month +
                              births.day, format='%Y%m%d')
births['dayofweek'] = births.index.dayofweek

births.pivot_table('births', index='dayofweek',columns='decade', aggfunc='mean').plot()
plt.gca().set_xticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
plt.ylabel('mean births by day')



In [58]:
births_by_date = births.pivot_table('births', [births.index.month, births.index.day])
births_by_date.head()


Out[58]:
1  1    4009.225
   2    4247.400
   3    4500.900
   4    4571.350
   5    4603.625
Name: births, dtype: float64

In [59]:
births_by_date.index = [pd.datetime(2012, month, day) for (month, day) in births_by_date.index]
births_by_date.head()


Out[59]:
2012-01-01    4009.225
2012-01-02    4247.400
2012-01-03    4500.900
2012-01-04    4571.350
2012-01-05    4603.625
Name: births, dtype: float64
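
pd.datetime was only an alias for datetime.datetime and was removed in pandas
2.0; importing the class directly is the portable spelling. A sketch:

from datetime import datetime
births_by_date.index = [datetime(2012, month, day)
                        for (month, day) in births_by_date.index]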

In [60]:
fig, ax = plt.subplots(figsize=(12, 4))
births_by_date.plot(ax=ax)


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a211e10>

In [1]:
import numpy as np
import pandas as pd
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
names = pd.Series(data)
names


Out[1]:
0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [2]:
names.str.capitalize()


Out[2]:
0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

In [3]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                   'Eric Idle', 'Terry Jones', 'Michael Palin'])
monte.str.lower()


Out[3]:
0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [4]:
monte.str.len()


Out[4]:
0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [5]:
monte.str.startswith('T')


Out[5]:
0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [6]:
monte.str.split()


Out[6]:
0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

In [8]:
monte.str.extract('([A-Za-z]+)')


/Users/kris/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
  if __name__ == '__main__':
Out[8]:
0     Graham
1       John
2      Terry
3       Eric
4      Terry
5    Michael
dtype: object

In [9]:
monte.str.findall('^[^AEIOU].*[^aeiou]$')


Out[9]:
0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

In [10]:
monte.str[0:3]


Out[10]:
0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [11]:
monte.str.split().str.get(-1)


Out[11]:
0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [12]:
full_monte = pd.DataFrame({'name': monte, 
                           'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D']})
full_monte


Out[12]:
    info            name
0  B|C|D  Graham Chapman
1    B|D     John Cleese
2    A|C   Terry Gilliam
3    B|D       Eric Idle
4    B|C     Terry Jones
5  B|C|D   Michael Palin

In [13]:
full_monte['info'].str.get_dummies('|')


Out[13]:
   A  B  C  D
0  0  1  1  1
1  0  1  0  1
2  1  0  1  0
3  0  1  0  1
4  0  1  1  0
5  0  1  1  1

In [14]:
# !curl -O http://openrecipes.s3.amazonaws.com/openrecipes.txt


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    20  100    20    0     0      7      0  0:00:02  0:00:02 --:--:--     8

In [16]:
# read the file and combine its lines into a single JSON list
with open('openrecipes.txt', 'r') as f:
    # strip whitespace from each line
    data = (line.strip() for line in f)
    # join the lines so that each one becomes an element of a JSON array
    data_json = "[{0}]".format(','.join(data))
# parse the result as JSON
recipes = pd.read_json(data_json)
recipes.shape


Out[16]:
(1042, 9)

In [17]:
recipes.head()


Out[17]:
  cookTime datePublished                                        description  \
0       PT    2013-04-01  Got leftover Easter eggs?  Got leftover East...
1    PT10M    2011-06-06  I finally have basil in my garden. Basil I can...
2    PT15M    2011-09-15  This was yummy. And easy. And pretty! And it t...
3    PT20M    2012-04-23  I made this for a late lunch Saturday, and it ...
4       PT    2011-06-13  Iced coffee is my life. When I wake up, often ...

                                               image  \
0  http://static.thepioneerwoman.com/cooking/file...
1  http://static.thepioneerwoman.com/cooking/file...
2  http://static.thepioneerwoman.com/cooking/file...
3  http://static.thepioneerwoman.com/cooking/file...
4  http://static.thepioneerwoman.com/cooking/file...

                                         ingredients  \
0  12 whole Hard Boiled Eggs\n1/2 cup Mayonnaise\...
1  3/4 cups Fresh Basil Leaves\n1/2 cup Grated Pa...
2  2 whole Pork Tenderloins\n Salt And Pepper, to...
3  1 pound Penne\n4 whole Boneless, Skinless Chic...
4  1 pound Ground Coffee (good, Rich Roast)\n8 qu...

                                          name prepTime recipeYield  \
0                     Easter Leftover Sandwich    PT15M           8
1                 Pasta with Pesto Cream Sauce     PT6M           8
2  Herb Roasted Pork Tenderloin with Preserves     PT5M          12
3                     Chicken Florentine Pasta    PT10M          10
4                          Perfect Iced Coffee     PT8H          24

                                                 url
0  http://thepioneerwoman.com/cooking/2013/04/eas...
1  http://thepioneerwoman.com/cooking/2011/06/pas...
2  http://thepioneerwoman.com/cooking/2011/09/her...
3  http://thepioneerwoman.com/cooking/2012/04/chi...
4  http://thepioneerwoman.com/cooking/2011/06/per...

In [18]:
recipes.ingredients.str.len().describe()


Out[18]:
count    1042.000000
mean      358.645873
std       187.332133
min        22.000000
25%       246.250000
50%       338.000000
75%       440.000000
max      3160.000000
Name: ingredients, dtype: float64

In [19]:
recipes.name[np.argmax(recipes.ingredients.str.len())]


Out[19]:
u'A Nice Berry Pie'
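
np.argmax returns a positional result that only works as a label here because
recipes has a default RangeIndex; Series.idxmax() states the intent directly
and is safe for any index. A sketch:

recipes.name[recipes.ingredients.str.len().idxmax()]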

In [20]:
recipes.description.str.contains('[Bb]reakfast').sum()


Out[20]:
11

In [21]:
recipes.ingredients.str.contains('[Cc]innamon').sum()


Out[21]:
79

In [22]:
recipes.ingredients.str.contains('[Cc]inamon').sum()


Out[22]:
0

In [23]:
import re
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',
              'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']
spice_df = pd.DataFrame(dict((spice, recipes.ingredients.str.contains(spice, re.IGNORECASE)) 
                             for spice in spice_list))
spice_df.head()


Out[23]:
   cumin  oregano  paprika  parsley  pepper  rosemary   sage   salt  tarragon  thyme
0  False    False    False    False   False     False  False  False     False  False
1  False    False    False    False   False     False  False  False     False  False
2  False    False    False    False   False     False  False  False     False  False
3  False    False    False    False   False     False  False  False     False  False
4  False    False    False    False   False     False  False  False     False  False
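
A caveat: the second positional argument of str.contains() is case, not flags,
so re.IGNORECASE above binds to case (any truthy value keeps the match
case-sensitive) and the counts that follow are for case-sensitive matches. To
actually ignore case, pass the flag by keyword; a sketch:

spice_df = pd.DataFrame({spice: recipes.ingredients.str.contains(spice, flags=re.IGNORECASE)
                         for spice in spice_list})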

In [26]:
spice_df.sum(axis=0)


Out[26]:
cumin        41
oregano      15
paprika      16
parsley      17
pepper      133
rosemary     12
sage         19
salt        458
tarragon      8
thyme        27
dtype: int64

In [28]:
selection = spice_df.query('salt & pepper & cumin')
len(selection)


Out[28]:
17

In [29]:
recipes.name[selection.index]


Out[29]:
570     Pierce Street Vegetarian Chili Recipe
641                 Sesame Yogurt Pasta Salad
655                                Palak Daal
670          My Favorite Grilled Kabob Recipe
680            Jamaican Veggie Patties Recipe
688                       Tempeh Curry Recipe
690                         Red Posole Recipe
691                        Feisty Green Beans
759         Chile Lime Tequila Popcorn Recipe
812                      Spiced Tomato Gratin
814                      New Year Noodle Soup
817         Winter Vegetable &amp; Tofu Korma
818                    Spiced Coconut Spinach
826          Moroccan Mint Roasted Vegetables
838        Morrocan Carrot and Chickpea Salad
1016                            Dukkah Recipe
1032        Baked Sweet Potato Falafel Recipe
Name: name, dtype: object

In [1]:
from datetime import datetime
datetime(year=2015, month=7, day=4)


Out[1]:
datetime.datetime(2015, 7, 4, 0, 0)

In [2]:
from dateutil import parser
date = parser.parse("4th of July, 2015")
date


Out[2]:
datetime.datetime(2015, 7, 4, 0, 0)

In [3]:
date.strftime('%A')


Out[3]:
'Saturday'

In [4]:
import numpy as np
date = np.array('2015-07-04', dtype=np.datetime64)
date


Out[4]:
array(datetime.date(2015, 7, 4), dtype='datetime64[D]')

In [5]:
date + np.arange(12)


Out[5]:
array(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
       '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
       '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'], dtype='datetime64[D]')

In [6]:
np.datetime64('2015-07-04')


Out[6]:
numpy.datetime64('2015-07-04')

In [7]:
np.datetime64('2015-07-04 12:00')


Out[7]:
numpy.datetime64('2015-07-04T12:00')

In [8]:
np.datetime64('2015-07-04 12:59:59.50', 'ns')


Out[8]:
numpy.datetime64('2015-07-04T12:59:59.500000000')
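
The unit code sets a trade-off: finer resolution narrows the representable range of dates. A sketch with coarser units:

    np.datetime64('2015-07-04', 'Y')  # year precision -> numpy.datetime64('2015')
    np.datetime64('2015-07-04', 'M')  # month precision -> numpy.datetime64('2015-07')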

In [9]:
import pandas as pd
date = pd.to_datetime("4th of July, 2015")
date


Out[9]:
Timestamp('2015-07-04 00:00:00')

In [10]:
date.strftime('%A')


Out[10]:
'Saturday'

In [11]:
date + pd.to_timedelta(np.arange(12), 'D')


Out[11]:
DatetimeIndex(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
               '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
               '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
              dtype='datetime64[ns]', freq=None)

In [12]:
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04', '2015-07-04', '2015-08-04'])
data = pd.Series([0, 1, 2, 3], index=index)
data


Out[12]:
2014-07-04    0
2014-08-04    1
2015-07-04    2
2015-08-04    3
dtype: int64

In [13]:
data['2014-07-04':'2015-07-04']


Out[13]:
2014-07-04    0
2014-08-04    1
2015-07-04    2
dtype: int64

In [14]:
data['2015']


Out[14]:
2015-07-04    2
2015-08-04    3
dtype: int64
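
Partial-string indexing works at any truncation of the date, not just the year; a sketch against the same series:

    data['2015-07']  # only the 2015-07-04 entry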

In [15]:
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',
                        '2015-Jul-6', '07-07-2015', '20150708'])
dates


Out[15]:
DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
               '2015-07-08'],
              dtype='datetime64[ns]', freq=None)

In [16]:
dates.to_period('D')


Out[16]:
PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
             '2015-07-08'],
            dtype='int64', freq='D')

In [17]:
dates - dates[0]


Out[17]:
TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)

In [18]:
pd.date_range('2015-07-03', '2015-07-10')


Out[18]:
DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

In [19]:
pd.date_range('2015-07-03', periods=8)


Out[19]:
DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

In [20]:
pd.date_range('2015-07-03', periods=8, freq='H')


Out[20]:
DatetimeIndex(['2015-07-03 00:00:00', '2015-07-03 01:00:00',
               '2015-07-03 02:00:00', '2015-07-03 03:00:00',
               '2015-07-03 04:00:00', '2015-07-03 05:00:00',
               '2015-07-03 06:00:00', '2015-07-03 07:00:00'],
              dtype='datetime64[ns]', freq='H')

In [21]:
pd.period_range('2015-07-03', periods=8, freq='M')


Out[21]:
PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12',
             '2016-01', '2016-02'],
            dtype='int64', freq='M')

In [22]:
pd.timedelta_range(0, periods=10, freq='H')


Out[22]:
TimedeltaIndex(['00:00:00', '01:00:00', '02:00:00', '03:00:00', '04:00:00',
                '05:00:00', '06:00:00', '07:00:00', '08:00:00', '09:00:00'],
               dtype='timedelta64[ns]', freq='H')

In [23]:
pd.timedelta_range(0, periods=8, freq='2H30T')


Out[23]:
TimedeltaIndex(['00:00:00', '02:30:00', '05:00:00', '07:30:00', '10:00:00',
                '12:30:00', '15:00:00', '17:30:00'],
               dtype='timedelta64[ns]', freq='150T')

In [26]:
from pandas.tseries.offsets import BDay
pd.date_range('2015-07-01', periods=10, freq=BDay())


Out[26]:
DatetimeIndex(['2015-07-01', '2015-07-02', '2015-07-03', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10',
               '2015-07-13', '2015-07-14'],
              dtype='datetime64[ns]', freq='B')
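
Offsets can also be added directly to timestamps; a sketch chosen so the step crosses a weekend:

    # 2015-07-03 was a Friday; one business day later is Monday
    pd.Timestamp('2015-07-03') + BDay()  # -> Timestamp('2015-07-06 00:00:00')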

In [1]:
# import pandas_datareader as pdr
# goog = data.DataReader('GOOG', start='2004', end='2016', data_source='google')
# goog = pdr.get_data_yahoo('GOOG')
import pandas as pd
goog = pd.read_csv('goog.csv')
goog['Date'] = pd.to_datetime(goog['Date'])
# goog.sort_values(by='Date', inplace=True)
goog.set_index('Date', inplace=True)
goog.sort_index(inplace=True)
goog.head()


Out[1]:
Open High Low Close Volume
Date
2004-08-19 49.96 51.98 47.93 50.12 -
2004-08-20 50.69 54.49 50.20 54.10 -
2004-08-23 55.32 56.68 54.47 54.65 -
2004-08-24 55.56 55.74 51.73 52.38 -
2004-08-25 52.43 53.95 51.89 52.95 -

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
goog = goog['Close']
goog.plot()


Out[2]:
<matplotlib.axes._subplots.AxesSubplot at 0x11676dbd0>

In [3]:
goog.plot(alpha=0.5, style='-')
goog.resample('BA').mean().plot(style=':')
goog.asfreq('BA').plot(style='--')
plt.legend(['input', 'resample', 'asfreq'], loc='upper left')


Out[3]:
<matplotlib.legend.Legend at 0x118e5f7d0>

In [4]:
# resample() is fundamentally a data aggregation
goog.resample('BA').mean()
# goog['2004'].mean() == 75.825213


Out[4]:
Date
2004-12-31     75.825213
2005-12-30    138.741032
2006-12-29    205.387131
2007-12-31    269.105817
2008-12-31    232.193241
2009-12-31    219.624286
2010-12-31    267.477131
2011-12-30    284.206190
2012-12-31    321.086800
2013-12-31    441.678294
2014-12-31    561.328452
2015-12-31    602.461270
2016-12-30           NaN
Freq: BA-DEC, Name: Close, dtype: float64

In [5]:
# asfreq() is fundamentally a data selection
goog.asfreq('BA')
# goog['2004-12-31'] == 96.29999..


Out[5]:
Date
2004-12-31     96.30
2005-12-30    207.22
2006-12-29    230.01
2007-12-31    345.39
2008-12-31    153.67
2009-12-31    309.68
2010-12-31    296.69
2011-12-30    322.63
2012-12-31    353.34
2013-12-31    559.79
2014-12-31    526.40
2015-12-31    758.88
Freq: BA-DEC, Name: Close, dtype: float64
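
The aggregation-versus-selection distinction noted in the comments above is easy to verify on a toy series (a sketch, not part of the original notebook):

    toy = pd.Series([1, 2, 3, 4], index=pd.date_range('2015-01-01', periods=4))
    toy.resample('2D').mean()  # aggregates each 2-day bin: 1.5, 3.5
    toy.asfreq('2D')           # selects every second point: 1, 3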

In [6]:
fig, ax = plt.subplots(2, sharex=True)
data = goog.iloc[:10]
data.asfreq('D').plot(ax=ax[0], marker='o')
data.asfreq('D', method='bfill').plot(ax=ax[1], style='-o')
data.asfreq('D', method='ffill').plot(ax=ax[1], style='--o')
ax[1].legend(["back-fill", "forward-fill"]);



In [7]:
data.asfreq('D')


Out[7]:
Date
2004-08-19    50.12
2004-08-20    54.10
2004-08-21      NaN
2004-08-22      NaN
2004-08-23    54.65
2004-08-24    52.38
2004-08-25    52.95
2004-08-26    53.90
2004-08-27    53.02
2004-08-28      NaN
2004-08-29      NaN
2004-08-30    50.95
2004-08-31    51.13
2004-09-01    50.07
Freq: D, Name: Close, dtype: float64

In [8]:
fig, ax = plt.subplots(3, sharey=True)
# apply a frequency to the data
goog = goog.asfreq('D', method='pad')
goog.plot(ax=ax[0])
goog.shift(900).plot(ax=ax[1])  # shift the data
goog.tshift(900).plot(ax=ax[2])  # shift the date index
# legends and annotations
local_max = pd.to_datetime('2007-11-05')
offset = pd.Timedelta(900, 'D')
ax[0].legend(['input'], loc=2)
ax[0].get_xticklabels()[4].set(weight='heavy', color='red')
ax[0].axvline(local_max, alpha=0.3, color='red')
ax[1].legend(['shift(900)'], loc=2)
ax[1].get_xticklabels()[4].set(weight='heavy', color='red')
ax[1].axvline(local_max + offset, alpha=0.3, color='red')
ax[2].legend(['tshift(900)'], loc=2)
ax[2].get_xticklabels()[1].set(weight='heavy', color='red')
ax[2].axvline(local_max + offset, alpha=0.3, color='red');
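
The shift()/tshift() contrast is easiest to see on a small series (a sketch: shift() moves the values, tshift() moves the index):

    s = pd.Series([1, 2, 3], index=pd.date_range('2015-01-01', periods=3))
    s.shift(1)   # values move: NaN, 1, 2 on the original dates
    s.tshift(1)  # dates move: 1, 2, 3 on 2015-01-02 through 2015-01-04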



In [15]:
ROI = 100 * (goog.tshift(-365) / goog - 1)  # return from buying on each date and selling one year later
ROI.plot()
plt.ylabel('% Return on Investment');



In [16]:
rolling = goog.rolling(365, center=True)
data = pd.DataFrame({'input': goog,
                     'one-year rolling_mean': rolling.mean(),
                     'one-year rolling_std': rolling.std()})
ax = data.plot(style=['-', '--', ':'])
ax.lines[0].set_alpha(0.3)



In [19]:
# !curl -o FremontBridge.csv https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD



In [20]:
data = pd.read_csv('FremontBridge.csv', index_col='Date', parse_dates=True)
data.head()


Out[20]:
Fremont Bridge West Sidewalk Fremont Bridge East Sidewalk
Date
2012-10-03 00:00:00 4.0 9.0
2012-10-03 01:00:00 4.0 6.0
2012-10-03 02:00:00 1.0 1.0
2012-10-03 03:00:00 2.0 3.0
2012-10-03 04:00:00 6.0 1.0

In [22]:
data.columns = ['west', 'east']
data['total'] = data.eval('west + east')
data.dropna().describe()


Out[22]:
west east total
count 27322.000000 27322.000000 27322.000000
mean 56.330210 53.597906 109.928117
std 71.597664 76.477755 134.082446
min 0.000000 0.000000 0.000000
25% 7.000000 7.000000 16.000000
50% 32.000000 28.000000 63.000000
75% 75.000000 66.000000 145.000000
max 698.000000 667.000000 946.000000

In [23]:
data.plot(alpha=0.5)
plt.ylabel('hourly bicycle count')


Out[23]:
<matplotlib.text.Text at 0x11b0a8250>

In [25]:
weekly = data.resample('W').sum()
weekly.plot(style=[':', '--', '-'])
plt.ylabel('weekly bicycle count')


Out[25]:
<matplotlib.text.Text at 0x113b895d0>

In [26]:
daily = data.resample('D').sum()
daily.rolling(30, center=True).sum().plot(style=[':', '--', '-'])
plt.ylabel('rolling 30-day total count')


Out[26]:
<matplotlib.text.Text at 0x11ace5590>

In [28]:
daily.rolling(50, center=True, win_type='gaussian').sum(std=10).plot(style=[':', '--', '-'])


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c791a90>

In [30]:
import numpy as np
by_time = data.groupby(data.index.time).mean()
hourly_ticks = 4 * 60 * 60 * np.arange(6)  # tick marks every 4 hours, in seconds
by_time.plot(xticks=hourly_ticks, style=[':', '--', '-'])


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c077a10>

In [31]:
data.groupby(data.index.time).mean()


Out[31]:
west east total
00:00:00 6.063213 7.577700 13.640913
01:00:00 3.409131 3.888499 7.297629
02:00:00 2.673703 2.402814 5.076517
03:00:00 1.919227 1.561018 3.480246
04:00:00 3.588235 2.618086 6.206321
05:00:00 16.401229 8.104478 24.505707
06:00:00 59.928885 29.070237 88.999122
07:00:00 155.978929 59.007024 214.985953
08:00:00 209.612818 93.088674 302.701493
09:00:00 117.459578 68.055360 185.514938
10:00:00 58.658172 40.335677 98.993849
11:00:00 49.271529 37.268014 86.539543
12:00:00 51.216169 41.634446 92.850615
13:00:00 53.350307 46.288850 99.639157
14:00:00 56.187006 52.187884 108.374890
15:00:00 64.713784 68.606673 133.320457
16:00:00 87.612478 123.700351 211.312830
17:00:00 119.531634 248.418278 367.949912
18:00:00 98.266257 171.579965 269.846221
19:00:00 54.417399 78.551845 132.969244
20:00:00 32.968366 43.506151 76.474517
21:00:00 22.920914 26.985062 49.905975
22:00:00 15.503515 18.718805 34.222320
23:00:00 10.215290 13.340070 23.555360

In [32]:
data.index


Out[32]:
DatetimeIndex(['2012-10-03 00:00:00', '2012-10-03 01:00:00',
               '2012-10-03 02:00:00', '2012-10-03 03:00:00',
               '2012-10-03 04:00:00', '2012-10-03 05:00:00',
               '2012-10-03 06:00:00', '2012-10-03 07:00:00',
               '2012-10-03 08:00:00', '2012-10-03 09:00:00',
               ...
               '2015-11-15 06:00:00', '2015-11-15 07:00:00',
               '2015-11-15 08:00:00', '2015-11-15 09:00:00',
               '2015-11-15 10:00:00', '2015-11-15 11:00:00',
               '2015-11-15 12:00:00', '2015-11-15 13:00:00',
               '2015-11-15 14:00:00', '2015-11-15 15:00:00'],
              dtype='datetime64[ns]', name=u'Date', length=27328, freq=None)

In [33]:
by_weekday = data.groupby(data.index.dayofweek).mean()
by_weekday.index = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
by_weekday.plot(style=[':', '--', '-'])


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d9bc150>

In [35]:
weekend = np.where(data.index.weekday < 5, 'Weekday', 'Weekend')
by_time = data.groupby([weekend, data.index.time]).mean()
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
by_time.loc['Weekday'].plot(ax=ax[0], title='weekday', style=[':', '--', '-'], xticks=hourly_ticks)
by_time.loc['Weekend'].plot(ax=ax[1], title='weekend', style=[':', '--', '-'], xticks=hourly_ticks)


Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f86d910>

In [36]:
by_time


Out[36]:
west east total
Weekday 00:00:00 5.386224 6.364084 11.750308
01:00:00 2.805658 3.073801 5.879459
02:00:00 2.306273 1.810578 4.116851
03:00:00 1.656827 1.446494 3.103321
04:00:00 4.226322 2.756458 6.982780
05:00:00 21.195572 10.022140 31.217712
06:00:00 80.421894 37.075031 117.496925
07:00:00 211.242312 76.934809 288.177122
08:00:00 280.177122 119.924969 400.102091
09:00:00 145.750000 79.849754 225.599754
10:00:00 58.832512 39.235222 98.067734
11:00:00 42.711823 32.000000 74.711823
12:00:00 42.684729 35.330049 78.014778
13:00:00 44.654367 39.847478 84.501845
14:00:00 48.233702 47.306273 95.539975
15:00:00 60.430504 70.853629 131.284133
16:00:00 95.327183 149.547355 244.874539
17:00:00 144.156212 327.441574 471.597786
18:00:00 119.381304 224.337023 343.718327
19:00:00 64.079951 98.837638 162.917589
20:00:00 37.548585 52.265683 89.814268
21:00:00 26.088561 31.375154 57.463715
22:00:00 17.169742 20.704797 37.874539
23:00:00 10.799508 14.226322 25.025830
Weekend 00:00:00 7.751534 10.604294 18.355828
01:00:00 4.914110 5.920245 10.834356
02:00:00 3.595679 3.888889 7.484568
03:00:00 2.573620 1.846626 4.420245
04:00:00 1.996933 2.273006 4.269939
05:00:00 4.444785 3.322086 7.766871
06:00:00 8.822086 9.107362 17.929448
07:00:00 18.159509 14.297546 32.457055
08:00:00 33.634969 26.162577 59.797546
09:00:00 46.993865 38.677914 85.671779
10:00:00 58.223926 43.076687 101.300613
11:00:00 65.610429 50.389571 116.000000
12:00:00 72.466258 57.337423 129.803681
13:00:00 75.036810 62.352761 137.389571
14:00:00 76.021472 64.361963 140.383436
15:00:00 75.395706 63.003067 138.398773
16:00:00 68.313846 59.043077 127.356923
17:00:00 57.932308 50.738462 108.670769
18:00:00 45.446154 39.606154 85.052308
19:00:00 30.246154 27.806154 58.052308
20:00:00 21.510769 21.593846 43.104615
21:00:00 14.996923 16.003077 31.000000
22:00:00 11.335385 13.750769 25.086154
23:00:00 8.753846 11.123077 19.876923

In [1]:
import numpy as np
rng = np.random.RandomState(42)
x = rng.rand(int(1e6))  # int() avoids passing a float size, which NumPy deprecates
y = rng.rand(int(1e6))
%timeit x + y


1000 loops, best of 3: 1.16 ms per loop

In [2]:
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)), dtype=x.dtype, count=len(x))


1 loop, best of 3: 421 ms per loop

In [3]:
# every intermediate result (x > 0.5, y < 0.5) is allocated
# as a full temporary array in memory
mask = (x > 0.5) & (y < 0.5)

# preferred for large arrays: numexpr evaluates the compound
# expression in chunks, without full-size temporaries
import numexpr
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
np.allclose(mask, mask_numexpr)


Out[3]:
True
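
To make the allocation cost concrete, the compound expression above behaves roughly like this explicit sequence (names are illustrative):

    tmp1 = x > 0.5      # first full-size boolean temporary
    tmp2 = y < 0.5      # second full-size temporary
    mask = tmp1 & tmp2  # third allocation for the result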

In [5]:
import pandas as pd
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) for _ in range(4))

In [6]:
%timeit df1 + df2 + df3 + df4


10 loops, best of 3: 73.8 ms per loop

In [7]:
%timeit pd.eval('df1 + df2 + df3 + df4')


10 loops, best of 3: 39.8 ms per loop

In [8]:
np.allclose(df1 + df2 + df3 + df4, pd.eval('df1 + df2 + df3 + df4'))


Out[8]:
True
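
pd.eval() routes the whole expression through numexpr, so the intermediate sums above are never materialized as full DataFrames. A quick way to gauge the stakes (a sketch):

    # each operand is 100000 x 100 float64 values
    df1.values.nbytes  # 80,000,000 bytes; plain '+' allocates comparable temporaries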

In [9]:
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3))) for i in range(5))

In [10]:
result1 = -df1 * df2 / (df3 + df4) - df5
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
np.allclose(result1, result2)


Out[10]:
True

In [11]:
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval('df1 < df2 <= df3 != df4')
np.allclose(result1, result2)


Out[11]:
True

In [12]:
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1, result2)


Out[12]:
True

In [13]:
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1, result3)


Out[13]:
True

In [14]:
result1 = df2.T[0] + df3.iloc[1]
result2 = pd.eval('df2.T[0] + df3.iloc[1]')
np.allclose(result1, result2)


Out[14]:
True

In [15]:
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()


Out[15]:
A B C
0 0.375506 0.406939 0.069938
1 0.069087 0.235615 0.154374
2 0.677945 0.433839 0.652324
3 0.264038 0.808055 0.347197
4 0.589161 0.252418 0.557789

In [16]:
result1 = (df['A'] + df['B']) / (df['C'] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)


Out[16]:
True

In [17]:
result3 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result3)


Out[17]:
True

In [18]:
df.eval('D = (A + B) / C', inplace=True)
df.head()


Out[18]:
A B C D
0 0.375506 0.406939 0.069938 11.187620
1 0.069087 0.235615 0.154374 1.973796
2 0.677945 0.433839 0.652324 1.704344
3 0.264038 0.808055 0.347197 3.087857
4 0.589161 0.252418 0.557789 1.508776

In [19]:
df.eval('D = (A - B) / C', inplace=True)
df.head()


Out[19]:
A B C D
0 0.375506 0.406939 0.069938 -0.449425
1 0.069087 0.235615 0.154374 -1.078728
2 0.677945 0.433839 0.652324 0.374209
3 0.264038 0.808055 0.347197 -1.566886
4 0.589161 0.252418 0.557789 0.603708

In [20]:
column_mean = df.mean(axis=1)  # row-wise mean across the columns
result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)


Out[20]:
True
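
The @ prefix marks a local Python variable rather than a column name. It is supported by DataFrame.eval() and query() but not by pd.eval(), which already resolves names from the calling namespace; a sketch:

    df.eval('A + @column_mean')    # @ pulls in the local variable
    pd.eval('df.A + column_mean')  # pd.eval sees locals directly; no @ needed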

In [21]:
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)


Out[21]:
True

In [22]:
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)


Out[22]:
True

In [23]:
Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)


Out[23]:
True

In [ ]:
# The main benefit of eval()/query() is the memory saved by avoiding
# full-size intermediate arrays; the sometimes cleaner syntax is a bonus.
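
A rough way to decide when they are worth using (a sketch): compare the data size against available memory, since the temporaries scale with it.

    df.values.nbytes  # bytes of the underlying array; temporaries are of comparable size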