In [2]:
import numpy as np
import pandas as pd
In [3]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data
Out[3]:
In [4]:
# get np array
data.values
Out[4]:
In [5]:
data.index
Out[5]:
In [6]:
data[1]
Out[6]:
In [7]:
data[1:3]
Out[7]:
In [8]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data
Out[8]:
In [9]:
data['b']
Out[9]:
In [10]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])
data
Out[10]:
In [11]:
data[5]
Out[11]:
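One subtlety worth flagging here (a small sketch using the same data as above): with an explicit integer index, plain indexing such as data[5] uses the explicit labels, while a slice such as data[1:3] falls back to the implicit positional index. The loc and iloc attributes, shown later in this section, make the intent unambiguous.
data[5]        # explicit label -> 0.5
data[1:3]      # implicit positions 1 and 2 -> 0.50, 0.75 (labels 5 and 3)
data.loc[5]    # always label-based
data.iloc[1]   # always position-based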
In [12]:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population
Out[12]:
In [13]:
population['California']
Out[13]:
In [14]:
population['California':'Illinois'] # inclusive
Out[14]:
In [15]:
pd.Series(5, index=[100, 200, 300])
Out[15]:
In [16]:
pd.Series({2:'a', 1:'b', 3:'c'})
Out[16]:
In [17]:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])
Out[17]:
In [18]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area
Out[18]:
In [19]:
states = pd.DataFrame({'population': population,
                       'area': area})
states
Out[19]:
In [20]:
states.index
Out[20]:
In [21]:
states.columns
Out[21]:
In [22]:
states['area']
Out[22]:
In [23]:
pd.DataFrame(population, columns=['population'])
Out[23]:
In [24]:
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)
Out[24]:
In [25]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])
Out[25]:
In [26]:
pd.DataFrame({'population': population, 'area': area})
Out[26]:
In [27]:
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])
Out[27]:
In [28]:
pd.DataFrame(np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')]))
Out[28]:
In [29]:
ind = pd.Index([2, 3, 5, 7, 11])
print(ind)
print(ind[1])
print(ind[::2])
# ind[1] = 0 would fail: an Index is immutable
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
print(indA & indB)  # intersection
print(indA | indB)  # union
print(indA ^ indB)  # symmetric difference
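Newer pandas versions deprecate using &, |, and ^ on Index objects as set operations (they now behave as elementwise logical operators); the explicit methods are preferred. A sketch with the same indA and indB:
print(indA.intersection(indB))
print(indA.union(indB))
print(indA.symmetric_difference(indB))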
In [30]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data['b'])
print('a' in data)
print(data.keys())
print(list(data.items()))  # iteritems() was removed in newer pandas; items() is equivalent
data['e'] = 1.25
print(data)
In [31]:
print(data['a':'c'])                      # slicing by explicit index (inclusive of 'c')
print(data[0:2])                          # slicing by implicit integer position
print(data[(data > 0.3) & (data < 0.8)])  # masking
print(data[['a', 'e']])                   # fancy indexing
In [32]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
# loc attribute allows indexing and slicing that always references the explicit index
print(data.loc[1])
print(data.loc[1:3])
print()
# iloc attribute allows indexing and slicing that always references the implicit Python-style index
print(data.iloc[1])
print(data.iloc[1:3])
In [33]:
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data
Out[33]:
In [34]:
print(data['area'])
print(data.area)
print(data['area'] is data.area)
print(data['pop'] is data.pop)  # False: DataFrame has a pop() method, so data.pop refers to that method
In [35]:
data['density'] = data['pop'] / data['area']
data
Out[35]:
In [36]:
data.values
Out[36]:
In [37]:
data.T
Out[37]:
In [38]:
data.values[0]
Out[38]:
In [39]:
data['area']
Out[39]:
In [40]:
data.iloc[:3, :2]
Out[40]:
In [41]:
data.loc[:'Illinois', :'pop']
Out[41]:
In [42]:
# The ix indexer allows a hybrid of these two approaches
# (note: .ix was deprecated and later removed from pandas)
data.ix[:3, :'pop']
Out[42]:
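Since .ix was removed in pandas 1.0, this cell fails on current versions. A rough equivalent of the same hybrid selection, assuming the data DataFrame defined above, chains positional row selection with label-based column slicing:
data.iloc[:3].loc[:, :'pop']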
In [43]:
data.loc[data.density > 100, ['pop', 'density']]
Out[43]:
In [44]:
print(data['Florida':'Illinois'])  # slicing by explicit index
print(data[1:3])                   # slicing by implicit integer position
print(data[data.density > 100])    # masking
In [45]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser
Out[45]:
In [46]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=['A', 'B', 'C', 'D'])
df
Out[46]:
In [47]:
np.exp(ser)
Out[47]:
In [48]:
np.sin(df * np.pi / 4)
Out[48]:
In [49]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')
population / area
Out[49]:
In [50]:
area.index | population.index
Out[50]:
In [51]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A+B
Out[51]:
In [52]:
A.add(B, fill_value=0)
Out[52]:
In [53]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB'))
A
Out[53]:
In [54]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
B
Out[54]:
In [55]:
A + B
Out[55]:
In [56]:
A.stack()
Out[56]:
In [57]:
A.stack().mean()
Out[57]:
In [58]:
A.add(B, fill_value=A.stack().mean())
Out[58]:
In [59]:
A = rng.randint(10, size=(3, 4))
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]
Out[59]:
In [60]:
df.subtract(df['R'], axis=0)
Out[60]:
In [61]:
halfrow = df.iloc[0, ::2]
halfrow
Out[61]:
In [62]:
df - halfrow
Out[62]:
In [63]:
vals1 = np.array([1, None, 3, 4])
vals1
Out[63]:
In [64]:
# This dtype=object means that the best common type representation NumPy could infer
# for the contents of the array is that they are Python objects. While this kind of
# object array is useful for some purposes, any operations on the data will be done
# at the Python level, with much more overhead than the typically fast operations seen
# for arrays with native types
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()
In [65]:
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype
Out[65]:
In [66]:
print(1 + np.nan)
print(0 * np.nan)
print(vals2.sum(), vals2.min(), vals2.max())  # aggregates over NaN yield NaN
In [67]:
print(np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2))
In [68]:
print(pd.Series([1, np.nan, 2, None]))
x = pd.Series(range(2), dtype=int)
print(x)
x[0] = None  # the integer series is upcast to float, and None becomes NaN
print(x)
In [69]:
data = pd.Series([1, np.nan, 'hello', None])
print(data.isnull())
print(data.notnull())
In [70]:
data.dropna()
Out[70]:
In [71]:
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, 5],
                   [np.nan, 4, 6]])
df
Out[71]:
In [72]:
df.dropna()
Out[72]:
In [73]:
df.dropna(axis='columns')
Out[73]:
In [74]:
df[3] = np.nan
df
Out[74]:
In [75]:
df.dropna(axis='columns', how='all')
Out[75]:
In [76]:
df.dropna(axis='rows', thresh=3) # rows with at least 3 non-nan vals
Out[76]:
In [77]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data
Out[77]:
In [78]:
data.fillna(0)
Out[78]:
In [79]:
# forward-fill
data.fillna(method='ffill')
Out[79]:
In [80]:
# backward-fill
data.fillna(method='bfill')
Out[80]:
In [81]:
df
Out[81]:
In [82]:
df.fillna(method='ffill', axis=1)
Out[82]:
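Newer pandas deprecates the method= argument to fillna(); the dedicated ffill() and bfill() methods express the same operations. A sketch assuming the same data and df as above:
data.ffill()       # same as data.fillna(method='ffill')
data.bfill()       # same as data.fillna(method='bfill')
df.ffill(axis=1)   # same as df.fillna(method='ffill', axis=1)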
In [83]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
pop = pd.Series(populations, index=index)
pop
Out[83]:
In [84]:
pop[('California', 2010):('Texas', 2000)]
Out[84]:
In [85]:
pop[[i for i in pop.index if i[1] == 2010]]
Out[85]:
In [86]:
index = pd.MultiIndex.from_tuples(index)
index
Out[86]:
In [87]:
pop = pop.reindex(index)
pop
Out[87]:
In [88]:
pop[:, 2010]
Out[88]:
In [89]:
pop_df = pop.unstack()
pop_df
Out[89]:
In [90]:
pop_df.stack()
Out[90]:
In [91]:
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df
Out[91]:
In [92]:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()
Out[92]:
In [93]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df
Out[93]:
In [94]:
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
pd.Series(data)
Out[94]:
In [95]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
Out[95]:
In [96]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])
Out[96]:
In [97]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
Out[97]:
In [98]:
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              labels=[[0, 0, 1, 1], [0, 1, 0, 1]])  # in newer pandas the 'labels' argument is named 'codes'
Out[98]:
In [99]:
pop.index.names = ['state', 'year']
pop
Out[99]:
In [100]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])
# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data
Out[100]:
In [101]:
health_data['Guido']
Out[101]:
In [102]:
pop
Out[102]:
In [103]:
pop['California', 2000]
Out[103]:
In [104]:
pop['California']
Out[104]:
In [105]:
pop.loc['California':'New York']
Out[105]:
In [106]:
pop[:, 2000]
Out[106]:
In [107]:
pop[pop > 22000000]
Out[107]:
In [108]:
pop[['California', 'Texas']]
Out[108]:
In [109]:
health_data
Out[109]:
In [113]:
health_data['Guido', 'HR']
Out[113]:
In [111]:
health_data.iloc[:2, :2]
Out[111]:
In [114]:
health_data.loc[:, ('Bob', 'HR')]
Out[114]:
In [115]:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]
Out[115]:
In [116]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data
Out[116]:
In [117]:
try:
    data['a':'b']
except KeyError as e:
    print(type(e))
    print(e)
In [118]:
data = data.sort_index()
data
Out[118]:
In [119]:
data['a':'b']
Out[119]:
In [120]:
pop.unstack(level=0)
Out[120]:
In [121]:
pop.unstack(level=1)
Out[121]:
In [122]:
pop.unstack().stack()
Out[122]:
In [123]:
# optionally specify the name of the data for the column representation
pop_flat = pop.reset_index(name='population')
pop_flat
Out[123]:
In [124]:
pop_flat.set_index(['state', 'year'])
Out[124]:
In [125]:
health_data
Out[125]:
In [126]:
data_mean = health_data.mean(level='year')  # aggregate over the 'visit' index level (see the groupby form below)
data_mean
Out[126]:
In [132]:
data_mean.mean(axis=1, level='type')
Out[132]:
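In newer pandas releases the level= argument to aggregations like mean() is deprecated and then removed; the same results can be obtained by grouping on the index (a sketch assuming the health_data frame from above):
data_mean = health_data.groupby(level='year').mean()
data_mean.T.groupby(level='type').mean().T   # group the columns by 'type'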
In [1]:
import numpy as np
import pandas as pd
def make_df(cols, ind):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, ind)
# example DataFrame
make_df('ABC', range(3))
Out[1]:
In [2]:
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z])
Out[2]:
In [3]:
x = [[1, 2],
     [3, 4]]
np.concatenate([x, x], axis=1)
Out[3]:
In [4]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])
Out[4]:
In [5]:
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
print(df1)
print(df2)
print(pd.concat([df1, df2]))
In [6]:
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
print(df3)
print(df4)
print(pd.concat([df3, df4], axis='columns'))  # axis=1 also works; 'col' is not a valid alias
In [7]:
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index # make duplicate indices!
print(x)
print(y)
print(pd.concat([x, y]))
In [8]:
try:
    # verify_integrity=True raises an error on duplicate indices
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)
In [9]:
print(x)
print(y)
print(pd.concat([x, y], ignore_index=True))
In [10]:
print(x)
print(y)
print(pd.concat([x, y], keys=['x', 'y']))
In [11]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5)
print(df6)
print(pd.concat([df5, df6]))
In [12]:
print(df5)
print(df6)
print(pd.concat([df5, df6], join_axes=[df5.columns]))  # join_axes was removed in pandas 1.0; see below
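On current pandas the join_axes result can be reproduced by reindexing the concatenated columns (a sketch with the df5 and df6 defined above):
print(pd.concat([df5, df6]).reindex(columns=df5.columns))
# or keep only the shared columns:
print(pd.concat([df5, df6], join='inner'))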
In [13]:
print(df1)
print(df2)
print(df1.append(df2))  # returns a copy; append() was later removed from pandas in favor of pd.concat
In [14]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_date': [2004, 2008, 2012, 2014]})
print(df1)
print(df2)
In [15]:
df3 = pd.merge(df1, df2)
df3
Out[15]:
In [16]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                    'supervisor': ['Carly', 'Guido', 'Steve']})
print(df3)
print(df4)
print(pd.merge(df3, df4))
In [17]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
                              'Engineering', 'Engineering', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 'coding', 'linux',
                               'spreadsheets', 'organization']})
print(df1)
print(df5)
print(pd.merge(df1, df5))
In [18]:
print(df1)
print(df2)
print(pd.merge(df1, df2, on='employee'))
In [19]:
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'salary': [70000, 80000, 120000, 90000]})
print(df1)
print(df3)
print(pd.merge(df1, df3, left_on="employee", right_on="name"))
In [20]:
pd.merge(df1, df3, left_on="employee", right_on="name").drop('name', axis=1)
Out[20]:
In [21]:
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
print(df1a)
print(df2a)
In [22]:
print(df1a)
print(df2a)
print(pd.merge(df1a, df2a, left_index=True, right_index=True))
In [23]:
print(df1a)
print(df2a)
# For convenience, DataFrames implement the join() method
# which performs a merge that defaults to joining on indices:
print(df1a.join(df2a))
In [24]:
print(df1a)
print(df3)
print(pd.merge(df1a, df3, left_index=True, right_on='name'))
In [25]:
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
                    'food': ['fish', 'beans', 'bread']},
                   columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
                    'drink': ['wine', 'beer']},
                   columns=['name', 'drink'])
print(df6)
print(df7)
print(pd.merge(df6, df7))
In [26]:
pd.merge(df6, df7, how='inner')
Out[26]:
In [27]:
pd.merge(df6, df7, how='outer')
Out[27]:
In [28]:
pd.merge(df6, df7, how='left') # Using how='right' works in a similar manner.
Out[28]:
In [29]:
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [3, 1, 4, 2]})
print(df8)
print(df9)
print(pd.merge(df8, df9, on="name"))
In [30]:
print(pd.merge(df8, df9, on="name", suffixes=["_L", "_R"]))
In [31]:
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv
In [32]:
pop = pd.read_csv('state-population.csv')
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')
print(pop.head())
print(areas.head())
print(abbrevs.head())
In [34]:
merged = pd.merge(pop, abbrevs, how='outer',
                  left_on='state/region', right_on='abbreviation')
merged = merged.drop('abbreviation', axis=1)  # drop duplicate info
merged.head()
Out[34]:
In [35]:
merged.isnull().any()
Out[35]:
In [36]:
merged[merged['population'].isnull()].head()
Out[36]:
In [37]:
merged.loc[merged['state'].isnull(), 'state/region'].unique()
Out[37]:
In [38]:
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States'
merged.isnull().any()
Out[38]:
In [41]:
final = pd.merge(merged, areas, on='state', how='left')
final.head()
Out[41]:
In [42]:
final.isnull().any()
Out[42]:
In [43]:
final['state'][final['area (sq. mi)'].isnull()].unique()
Out[43]:
In [44]:
final.dropna(inplace=True)
final.head()
Out[44]:
In [45]:
final['area (sq. mi)'].isnull().any()
Out[45]:
In [46]:
data2010 = final.query("year == 2010 & ages == 'total'")
data2010.head()
Out[46]:
In [47]:
data2010.set_index('state', inplace=True)
density = data2010['population'] / data2010['area (sq. mi)']
density.sort_values(ascending=False, inplace=True)
density.head()
Out[47]:
In [48]:
density.tail()
Out[48]:
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape
Out[1]:
In [2]:
planets.head()
Out[2]:
In [3]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser
Out[3]:
In [4]:
ser.mean(), ser.sum()
Out[4]:
In [5]:
df = pd.DataFrame({'A': rng.rand(5), 'B': rng.rand(5)})
df
Out[5]:
In [6]:
df.mean()
Out[6]:
In [7]:
df.mean(axis='columns')
Out[7]:
In [8]:
planets.dropna().describe()
Out[8]:
In [10]:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)}, columns=['key', 'data'])
df
Out[10]:
In [11]:
df.groupby('key').sum()
Out[11]:
In [12]:
planets.groupby('method')
Out[12]:
In [13]:
planets.groupby('method')['orbital_period']
Out[13]:
In [14]:
planets.groupby('method')['orbital_period'].median()
Out[14]:
In [15]:
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method, group.shape))
In [17]:
planets.groupby('method')['year'].describe().unstack()
Out[17]:
In [18]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                  columns=['key', 'data1', 'data2'])
df
Out[18]:
In [19]:
df.groupby('key').aggregate(['min', np.median, max])
Out[19]:
In [20]:
df.groupby('key').aggregate({'data1': 'min', 'data2': 'max'})
Out[20]:
In [21]:
print(df.groupby('key').std())
print(df.groupby('key').filter(lambda x: x['data2'].std() > 4))
In [22]:
df.groupby('key').transform(lambda x: x - x.mean())
Out[22]:
In [26]:
def norm_by_data2(x):
    # x is a DataFrame of group values
    x['data1'] /= x['data2'].sum()
    return x
print(df.groupby('key').apply(norm_by_data2))
In [27]:
L=[0,1,0,1,2,0]
print(df)
print(df.groupby(L).sum())
In [28]:
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
print(df2)
print(df2.groupby(mapping).sum())
In [29]:
df2.groupby(str.lower).mean()
Out[29]:
In [30]:
df2.groupby([str.lower, mapping]).mean()
Out[30]:
In [31]:
decade = 10 * (planets['year'] // 10)
decade = decade.astype(str) + 's'
decade.name = 'decade'
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)
Out[31]:
In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()
Out[42]:
In [43]:
titanic.groupby('sex')['survived'].mean()
Out[43]:
In [45]:
titanic.groupby(['sex', 'class'])['survived'].mean().unstack()
Out[45]:
In [46]:
titanic.pivot_table('survived', index='sex', columns='class')
Out[46]:
In [47]:
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')
Out[47]:
In [50]:
fare = pd.qcut(titanic['fare'], 2) # split into 2 equal-size groups
titanic.pivot_table('survived', ['sex', age], [fare, 'class'])
Out[50]:
In [51]:
titanic.pivot_table(index='sex', columns='class',
                    aggfunc={'survived': np.sum, 'fare': 'mean'})
Out[51]:
In [52]:
titanic.pivot_table('survived', index='sex', columns='class', margins=True)
Out[52]:
In [53]:
# !curl -O https://raw.githubusercontent.com/jakevdp/data-CDCbirths/master/births.csv
In [54]:
births = pd.read_csv('births.csv')
births.head()
Out[54]:
In [55]:
births['decade'] = 10 * (births['year'] // 10)
births.pivot_table('births', index='decade', columns='gender', aggfunc='sum')
Out[55]:
In [56]:
%matplotlib inline
import matplotlib.pyplot as plt
sns.set() # use Seaborn styles
births.pivot_table('births', index='year', columns='gender', aggfunc='sum').plot()
plt.ylabel('total births per year')
Out[56]:
In [57]:
quartiles = np.percentile(births['births'], [25, 50, 75])
mu = quartiles[1]
sig = 0.74 * (quartiles[2] - quartiles[0])  # robust estimate of the std dev (0.74 * IQR for a Gaussian)
births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)')
births['day'] = births['day'].astype(int)
births.index = pd.to_datetime(10000 * births.year +
                              100 * births.month +
                              births.day, format='%Y%m%d')
births['dayofweek'] = births.index.dayofweek
births.pivot_table('births', index='dayofweek', columns='decade', aggfunc='mean').plot()
plt.gca().set_xticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
plt.ylabel('mean births by day')
In [58]:
births_by_date = births.pivot_table('births', [births.index.month, births.index.day])
births_by_date.head()
Out[58]:
In [59]:
from datetime import datetime
births_by_date.index = [datetime(2012, month, day) for (month, day) in births_by_date.index]  # pd.datetime was removed in newer pandas
births_by_date.head()
Out[59]:
In [60]:
fig, ax = plt.subplots(figsize=(12, 4))
births_by_date.plot(ax=ax)
Out[60]:
In [1]:
import numpy as np
import pandas as pd
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
names = pd.Series(data)
names
Out[1]:
In [2]:
names.str.capitalize()
Out[2]:
In [3]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                   'Eric Idle', 'Terry Jones', 'Michael Palin'])
monte.str.lower()
Out[3]:
In [4]:
monte.str.len()
Out[4]:
In [5]:
monte.str.startswith('T')
Out[5]:
In [6]:
monte.str.split()
Out[6]:
In [8]:
monte.str.extract('([A-Za-z]+)')
Out[8]:
In [9]:
monte.str.findall('^[^AEIOU].*[^aeiou]$')
Out[9]:
In [10]:
monte.str[0:3]
Out[10]:
In [11]:
monte.str.split().str.get(-1)
Out[11]:
In [12]:
full_monte = pd.DataFrame({'name': monte,
                           'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D']})
full_monte
Out[12]:
In [13]:
full_monte['info'].str.get_dummies('|')
Out[13]:
In [14]:
# !curl -O http://openrecipes.s3.amazonaws.com/openrecipes.txt
In [16]:
# read the entire file into a Python array
with open('openrecipes.txt', 'r') as f:
    # Extract each line
    data = (line.strip() for line in f)
    # Reformat so each line is the element of a list
    data_json = "[{0}]".format(','.join(data))
# read the result as a JSON
recipes = pd.read_json(data_json)
recipes.shape
Out[16]:
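The manual line-joining above works, but pandas can also read newline-delimited JSON directly; a sketch assuming the same file downloaded above:
recipes = pd.read_json('openrecipes.txt', lines=True)
recipes.shape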
In [17]:
recipes.head()
Out[17]:
In [18]:
recipes.ingredients.str.len().describe()
Out[18]:
In [19]:
recipes.name[np.argmax(recipes.ingredients.str.len())]
Out[19]:
In [20]:
recipes.description.str.contains('[Bb]reakfast').sum()
Out[20]:
In [21]:
recipes.ingredients.str.contains('[Cc]innamon').sum()
Out[21]:
In [22]:
recipes.ingredients.str.contains('[Cc]inamon').sum()
Out[22]:
In [23]:
import re
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',
              'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']
spice_df = pd.DataFrame(dict((spice, recipes.ingredients.str.contains(spice, re.IGNORECASE))
                             for spice in spice_list))
spice_df.head()
Out[23]:
In [26]:
spice_df.sum(axis=0)
Out[26]:
In [28]:
selection = spice_df.query('salt & pepper & cumin')
len(selection)
Out[28]:
In [29]:
recipes.name[selection.index]
Out[29]:
In [1]:
from datetime import datetime
datetime(year=2015, month=7, day=4)
Out[1]:
In [2]:
from dateutil import parser
date = parser.parse("4th of July, 2015")
date
Out[2]:
In [3]:
date.strftime('%A')
Out[3]:
In [4]:
import numpy as np
date = np.array('2015-07-04', dtype=np.datetime64)
date
Out[4]:
In [5]:
date + np.arange(12)
Out[5]:
In [6]:
np.datetime64('2015-07-04')
Out[6]:
In [7]:
np.datetime64('2015-07-04 12:00')
Out[7]:
In [8]:
np.datetime64('2015-07-04 12:59:59.50', 'ns')
Out[8]:
In [9]:
import pandas as pd
date = pd.to_datetime("4th of July, 2015")
date
Out[9]:
In [10]:
date.strftime('%A')
Out[10]:
In [11]:
date + pd.to_timedelta(np.arange(12), 'D')
Out[11]:
In [12]:
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04', '2015-07-04', '2015-08-04'])
data = pd.Series([0, 1, 2, 3], index=index)
data
Out[12]:
In [13]:
data['2014-07-04':'2015-07-04']
Out[13]:
In [14]:
data['2015']
Out[14]:
In [15]:
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',
                        '2015-Jul-6', '07-07-2015', '20150708'])
dates
Out[15]:
In [16]:
dates.to_period('D')
Out[16]:
In [17]:
dates - dates[0]
Out[17]:
In [18]:
pd.date_range('2015-07-03', '2015-07-10')
Out[18]:
In [19]:
pd.date_range('2015-07-03', periods=8)
Out[19]:
In [20]:
pd.date_range('2015-07-03', periods=8, freq='H')
Out[20]:
In [21]:
pd.period_range('2015-07-03', periods=8, freq='M')
Out[21]:
In [22]:
pd.timedelta_range(0, periods=10, freq='H')
Out[22]:
In [23]:
pd.timedelta_range(0, periods=8, freq='2H30T')
Out[23]:
In [26]:
from pandas.tseries.offsets import BDay
pd.date_range('2015-07-01', periods=10, freq=BDay())
Out[26]:
In [1]:
# import pandas_datareader as pdr
# goog = data.DataReader('GOOG', start='2004', end='2016', data_source='google')
# goog = pdr.get_data_yahoo('GOOG')
import pandas as pd
goog = pd.read_csv('goog.csv')
goog['Date'] = pd.to_datetime(goog['Date'])
# goog.sort_values(by='Date', inplace=True)
goog.set_index('Date', inplace=True)
goog.sort_index(inplace=True)
goog.head()
Out[1]:
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
goog = goog['Close']
goog.plot()
Out[2]:
In [3]:
goog.plot(alpha=0.5, style='-')
goog.resample('BA').mean().plot(style=':')
goog.asfreq('BA').plot(style='--')
plt.legend(['input', 'resample', 'asfreq'], loc='upper left')
Out[3]:
In [4]:
# resample() is fundamentally a data aggregation
goog.resample('BA').mean()
# goog['2014'].mean() == 75.825213
Out[4]:
In [5]:
# asfreq() is fundamentally a data selection
goog.asfreq('BA')
# goog['2004-12-31'] == 96.29999..
Out[5]:
In [6]:
fig, ax = plt.subplots(2, sharex=True)
data = goog.iloc[:10]
data.asfreq('D').plot(ax=ax[0], marker='o')
data.asfreq('D', method='bfill').plot(ax=ax[1], style='-o')
data.asfreq('D', method='ffill').plot(ax=ax[1], style='--o')
ax[1].legend(["back-fill", "forward-fill"]);
In [7]:
data.asfreq('D')
Out[7]:
In [8]:
fig, ax = plt.subplots(3, sharey=True)
# apply a frequency to the data
goog = goog.asfreq('D', method='pad')
goog.plot(ax=ax[0])
goog.shift(900).plot(ax=ax[1]) # shift the data
goog.tshift(900).plot(ax=ax[2]) # shift the date index
# legends and annotations
local_max = pd.to_datetime('2007-11-05')
offset = pd.Timedelta(900, 'D')
ax[0].legend(['input'], loc=2)
ax[0].get_xticklabels()[4].set(weight='heavy', color='red')
ax[0].axvline(local_max, alpha=0.3, color='red')
ax[1].legend(['shift(900)'], loc=2)
ax[1].get_xticklabels()[4].set(weight='heavy', color='red')
ax[1].axvline(local_max + offset, alpha=0.3, color='red')
ax[2].legend(['tshift(900)'], loc=2)
ax[2].get_xticklabels()[1].set(weight='heavy', color='red')
ax[2].axvline(local_max + offset, alpha=0.3, color='red');
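tshift() has since been deprecated and removed in newer pandas; passing a freq to shift() moves the index instead of the values, so a rough modern equivalent of the third panel (assuming the same goog series) is:
goog.shift(900, freq='D').plot()   # shift the date index by 900 days, like tshift(900)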
In [15]:
ROI = 100 * (goog.tshift(-365) / goog - 1)
ROI.plot()
plt.ylabel('% Return on Investment');
In [16]:
rolling = goog.rolling(365, center=True)
data = pd.DataFrame({'input': goog,
                     'one-year rolling_mean': rolling.mean(),
                     'one-year rolling_std': rolling.std()})
ax = data.plot(style=['-', '--', ':'])
ax.lines[0].set_alpha(0.3)
In [19]:
# !curl -o FremontBridge.csv https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD
In [20]:
data = pd.read_csv('FremontBridge.csv', index_col='Date', parse_dates=True)
data.head()
Out[20]:
In [22]:
data.columns = ['west', 'east']
data['total'] = data.eval('west + east')
data.dropna().describe()
Out[22]:
In [23]:
data.plot(alpha=0.5)
plt.ylabel('hourly bicycle count')
Out[23]:
In [25]:
weekly = data.resample('W').sum()
weekly.plot(style=[':', '--', '-'])
plt.ylabel('weekly bicycle count')
Out[25]:
In [26]:
daily = data.resample('D').sum()
daily.rolling(30, center=True).sum().plot(style=[':', '--', '-'])
plt.ylabel('rolling mean hourly count')
Out[26]:
In [28]:
daily.rolling(50, center=True, win_type='gaussian').sum(std=10).plot(style=[':', '--', '-'])
Out[28]:
In [30]:
import numpy as np
by_time = data.groupby(data.index.time).mean()
hourly_ticks = 4 * 60 * 60 * np.arange(6)
by_time.plot(xticks=hourly_ticks, style=[':', '--', '-'])
Out[30]:
In [31]:
data.groupby(data.index.time).mean()
Out[31]:
In [32]:
data.index
Out[32]:
In [33]:
by_weekday = data.groupby(data.index.dayofweek).mean()
by_weekday.index = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
by_weekday.plot(style=[':', '--', '-'])
Out[33]:
In [35]:
weekend = np.where(data.index.weekday < 5, 'Weekday', 'Weekend')
by_time = data.groupby([weekend, data.index.time]).mean()
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
# .ix was removed in pandas 1.0; .loc selects by label on the first index level
by_time.loc['Weekday'].plot(ax=ax[0], title='weekday', style=[':', '--', '-'], xticks=hourly_ticks)
by_time.loc['Weekend'].plot(ax=ax[1], title='weekend', style=[':', '--', '-'], xticks=hourly_ticks)
Out[35]:
In [36]:
by_time
Out[36]:
In [1]:
import numpy as np
rng = np.random.RandomState(42)
x = rng.rand(1_000_000)  # rand() needs an integer size; 1E6 is a float
y = rng.rand(1_000_000)
%timeit x + y
In [2]:
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)), dtype=x.dtype, count=len(x))
In [3]:
mask = (x > 0.5) & (y < 0.5)  # every intermediate step is explicitly allocated in memory
# numexpr evaluates the compound expression without allocating full-size temporary arrays
import numexpr
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
np.allclose(mask, mask_numexpr)
Out[3]:
In [5]:
import pandas as pd
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) for _ in range(4))
In [6]:
%timeit df1 + df2 + df3 + df4
In [7]:
%timeit pd.eval('df1 + df2 + df3 + df4')
In [8]:
np.allclose(df1 + df2 + df3 + df4, pd.eval('df1 + df2 + df3 + df4'))
Out[8]:
In [9]:
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3))) for i in range(5))
In [10]:
result1 = -df1 * df2 / (df3 + df4) - df5
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
np.allclose(result1, result2)
Out[10]:
In [11]:
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval('df1 < df2 <= df3 != df4')
np.allclose(result1, result2)
Out[11]:
In [12]:
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1, result2)
Out[12]:
In [13]:
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1, result3)
Out[13]:
In [14]:
result1 = df2.T[0] + df3.iloc[1]
result2 = pd.eval('df2.T[0] + df3.iloc[1]')
np.allclose(result1, result2)
Out[14]:
In [15]:
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()
Out[15]:
In [16]:
result1 = (df['A'] + df['B']) / (df['C'] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)
Out[16]:
In [17]:
result3 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result3)
Out[17]:
In [18]:
df.eval('D = (A + B) / C', inplace=True)
df.head()
Out[18]:
In [19]:
df.eval('D = (A - B) / C', inplace=True)
df.head()
Out[19]:
In [20]:
column_mean = df.mean(axis=1)
result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)
Out[20]:
In [21]:
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)
Out[21]:
In [22]:
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)
Out[22]:
In [23]:
Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)
Out[23]:
In [ ]:
# The benefit of eval()/query() is mainly the memory they save on large arrays, plus the sometimes cleaner syntax they offer.
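A rough way to gauge whether eval()/query() will help on a given problem is to check whether the arrays involved are large relative to the CPU cache (roughly several megabytes or more); for example, for the df defined above:
df.values.nbytes   # approximate size of the underlying array in bytes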